
Merge branch 'linus'

Trond Myklebust 19 years ago
commit ac58c9059d
100 changed files with 9757 additions and 4626 deletions
  1. + 1 - 1  Documentation/DocBook/Makefile
  2. + 0 - 585  Documentation/DocBook/sis900.tmpl
  3. + 7 - 0  Documentation/feature-removal-schedule.txt
  4. + 0 - 2  Documentation/networking/00-INDEX
  5. + 6 - 6  Documentation/networking/README.ipw2100
  6. + 42 - 2  Documentation/networking/README.ipw2200
  7. + 0 - 257  Documentation/networking/sis900.txt
  8. + 2 - 3  arch/ppc/platforms/hdpu.c
  9. + 15 - 25  arch/sparc/kernel/ioport.c
  10. + 18 - 0  arch/sparc64/Kconfig
  11. + 14 - 11  arch/sparc64/defconfig
  12. + 5 - 3  arch/sparc64/kernel/Makefile
  13. + 5 - 9  arch/sparc64/kernel/binfmt_aout32.c
  14. + 3 - 1  arch/sparc64/kernel/binfmt_elf32.c
  15. + 7 - 0  arch/sparc64/kernel/cpu.c
  16. + 179 - 10  arch/sparc64/kernel/devices.c
  17. + 0 - 170  arch/sparc64/kernel/dtlb_backend.S
  18. + 0 - 109  arch/sparc64/kernel/dtlb_base.S
  19. + 39 - 0  arch/sparc64/kernel/dtlb_miss.S
  20. + 1 - 2  arch/sparc64/kernel/ebus.c
  21. + 232 - 99  arch/sparc64/kernel/entry.S
  22. + 72 - 98  arch/sparc64/kernel/etrap.S
  23. + 172 - 82  arch/sparc64/kernel/head.S
  24. + 252 - 87  arch/sparc64/kernel/irq.c
  25. + 0 - 79  arch/sparc64/kernel/itlb_base.S
  26. + 39 - 0  arch/sparc64/kernel/itlb_miss.S
  27. + 224 - 139  arch/sparc64/kernel/ktlb.S
  28. + 13 - 0  arch/sparc64/kernel/pci.c
  29. + 174 - 127  arch/sparc64/kernel/pci_common.c
  30. + 23 - 13  arch/sparc64/kernel/pci_iommu.c
  31. + 10 - 13  arch/sparc64/kernel/pci_psycho.c
  32. + 10 - 13  arch/sparc64/kernel/pci_sabre.c
  33. + 10 - 14  arch/sparc64/kernel/pci_schizo.c
  34. + 1147 - 0  arch/sparc64/kernel/pci_sun4v.c
  35. + 31 - 0  arch/sparc64/kernel/pci_sun4v.h
  36. + 95 - 0  arch/sparc64/kernel/pci_sun4v_asm.S
  37. + 57 - 76  arch/sparc64/kernel/process.c
  38. + 3 - 0  arch/sparc64/kernel/ptrace.c
  39. + 105 - 10  arch/sparc64/kernel/rtrap.S
  40. + 5 - 5  arch/sparc64/kernel/sbus.c
  41. + 124 - 285  arch/sparc64/kernel/setup.c
  42. + 314 - 104  arch/sparc64/kernel/smp.c
  43. + 14 - 12  arch/sparc64/kernel/sparc64_ksyms.c
  44. + 334 - 0  arch/sparc64/kernel/sun4v_ivec.S
  45. + 421 - 0  arch/sparc64/kernel/sun4v_tlb_miss.S
  46. + 263 - 34  arch/sparc64/kernel/sys_sparc.c
  47. + 5 - 4  arch/sparc64/kernel/sys_sparc32.c
  48. + 348 - 25  arch/sparc64/kernel/time.c
  49. + 163 - 75  arch/sparc64/kernel/trampoline.S
  50. + 388 - 15  arch/sparc64/kernel/traps.c
  51. + 442 - 0  arch/sparc64/kernel/tsb.S
  52. + 25 - 38  arch/sparc64/kernel/ttable.S
  53. + 34 - 11  arch/sparc64/kernel/unaligned.c
  54. + 5 - 6  arch/sparc64/kernel/us2e_cpufreq.c
  55. + 5 - 6  arch/sparc64/kernel/us3_cpufreq.c
  56. + 894 - 0  arch/sparc64/kernel/visemul.c
  57. + 16 - 0  arch/sparc64/kernel/vmlinux.lds.S
  58. + 124 - 356  arch/sparc64/kernel/winfixup.S
  59. + 2 - 0  arch/sparc64/lib/Makefile
  60. + 163 - 0  arch/sparc64/lib/NGbzero.S
  61. + 37 - 0  arch/sparc64/lib/NGcopy_from_user.S
  62. + 40 - 0  arch/sparc64/lib/NGcopy_to_user.S
  63. + 368 - 0  arch/sparc64/lib/NGmemcpy.S
  64. + 96 - 0  arch/sparc64/lib/NGpage.S
  65. + 33 - 0  arch/sparc64/lib/NGpatch.S
  66. + 2 - 1  arch/sparc64/lib/U3patch.S
  67. + 9 - 9  arch/sparc64/lib/bzero.S
  68. + 5 - 7  arch/sparc64/lib/clear_page.S
  69. + 2 - 5  arch/sparc64/lib/copy_page.S
  70. + 8 - 11  arch/sparc64/lib/delay.c
  71. + 299 - 1  arch/sparc64/lib/xor.S
  72. + 22 - 2  arch/sparc64/math-emu/math.c
  73. + 1 - 1  arch/sparc64/mm/Makefile
  74. + 11 - 4  arch/sparc64/mm/fault.c
  75. + 18 - 22  arch/sparc64/mm/generic.c
  76. + 175 - 4  arch/sparc64/mm/hugetlbpage.c
  77. + 245 - 603  arch/sparc64/mm/init.c
  78. + 4 - 60  arch/sparc64/mm/tlb.c
  79. + 440 - 0  arch/sparc64/mm/tsb.c
  80. + 293 - 81  arch/sparc64/mm/ultra.S
  81. + 17 - 194  arch/sparc64/prom/cif.S
  82. + 6 - 0  arch/sparc64/prom/console.c
  83. + 8 - 52  arch/sparc64/prom/init.c
  84. + 24 - 20  arch/sparc64/prom/misc.c
  85. + 0 - 11  arch/sparc64/prom/p1275.c
  86. + 6 - 3  arch/sparc64/prom/tree.c
  87. + 2 - 2  arch/sparc64/solaris/misc.c
  88. + 49 - 95  block/as-iosched.c
  89. + 175 - 179  block/cfq-iosched.c
  90. + 27 - 89  block/deadline-iosched.c
  91. + 118 - 53  block/elevator.c
  92. + 72 - 33  block/ll_rw_blk.c
  93. + 2 - 2  drivers/block/loop.c
  94. + 2 - 2  drivers/block/pktcdvd.c
  95. + 1 - 1  drivers/block/umem.c
  96. + 2 - 2  drivers/md/dm.c
  97. + 4 - 1  drivers/md/md.c
  98. + 23 - 47  drivers/net/3c509.c
  99. + 5 - 4  drivers/net/3c523.c
  100. + 4 - 3  drivers/net/3c59x.c

+ 1 - 1
Documentation/DocBook/Makefile

@@ -9,7 +9,7 @@
 DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \
 	    kernel-hacking.xml kernel-locking.xml deviceiobook.xml \
 	    procfs-guide.xml writing_usb_driver.xml \
-	    sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \
+	    kernel-api.xml journal-api.xml lsm.xml usb.xml \
 	    gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml
 
 ###

+ 0 - 585
Documentation/DocBook/sis900.tmpl

@@ -1,585 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
-
-<book id="SiS900Guide">
-
-<bookinfo>
-
-<title>SiS 900/7016 Fast Ethernet Device Driver</title>
-
-<authorgroup>
-<author>
-<firstname>Ollie</firstname>
-<surname>Lho</surname>
-</author>
-
-<author>
-<firstname>Lei Chun</firstname>
-<surname>Chang</surname>
-</author>
-</authorgroup>
-
-<edition>Document Revision: 0.3 for SiS900 driver v1.06 &amp; v1.07</edition>
-<pubdate>November 16, 2000</pubdate>
-
-<copyright>
- <year>1999</year>
- <holder>Silicon Integrated System Corp.</holder>
-</copyright>
-
-<legalnotice>
- <para>
-	This program is free software; you can redistribute it and/or modify
-	it under the terms of the GNU General Public License as published by
-	the Free Software Foundation; either version 2 of the License, or
-	(at your option) any later version.
- </para>
-
- <para>
-	This program is distributed in the hope that it will be useful,
-	but WITHOUT ANY WARRANTY; without even the implied warranty of
-	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-	GNU General Public License for more details.
- </para>
-
- <para>
-	You should have received a copy of the GNU General Public License
-	along with this program; if not, write to the Free Software
-	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- </para>
-</legalnotice>
-
-<abstract>
-<para>
-This document gives some information on installation and usage of SiS 900/7016
-device driver under Linux.
-</para>
-</abstract>
-
-</bookinfo>
-
-<toc></toc>
-
-<chapter id="intro">
- <title>Introduction</title>
-
-<para>
-This document describes the revision 1.06 and 1.07 of SiS 900/7016 Fast Ethernet 
-device driver under Linux. The driver is developed by Silicon Integrated
-System Corp. and distributed freely under the GNU General Public License (GPL).
-The driver can be compiled as a loadable module and used under Linux kernel 
-version 2.2.x. (rev. 1.06)
-With minimal changes, the driver can also be used under 2.3.x and 2.4.x kernel 
-(rev. 1.07), please see 
-<xref linkend="install"/>. If you are intended to
-use the driver for earlier kernels, you are on your own.
-</para>
-
-<para>
-The driver is tested with usual TCP/IP applications including
-FTP, Telnet, Netscape etc. and is used constantly by the developers.
-</para>
-
-<para>
-Please send all comments/fixes/questions to
-<ulink url="mailto:lcchang@sis.com.tw">Lei-Chun Chang</ulink>.
-</para>
-</chapter>
-
-<chapter id="changes">
- <title>Changes</title>
-
-<para>
-Changes made in Revision 1.07
-
-<orderedlist>
-<listitem>
-<para>
-Separation of sis900.c and sis900.h in order to move most 
-constant definition to sis900.h (many of those constants were
-corrected)
-</para>
-</listitem>
-
-<listitem>
-<para>
-Clean up PCI detection, the pci-scan from Donald Becker were not used,
-just simple pci&lowbar;find&lowbar;*.
-</para>
-</listitem>
-
-<listitem>
-<para>
-MII detection is modified to support multiple mii transceiver.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Bugs in read&lowbar;eeprom, mdio&lowbar;* were removed.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Lot of sis900 irrelevant comments were removed/changed and
-more comments were added to reflect the real situation.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Clean up of physical/virtual address space mess in buffer 
-descriptors.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Better transmit/receive error handling.
-</para>
-</listitem>
-
-<listitem>
-<para>
-The driver now uses zero-copy single buffer management
-scheme to improve performance.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Names of variables were changed to be more consistent.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Clean up of auo-negotiation and timer code.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Automatic detection and change of PHY on the fly.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Bug in mac probing fixed.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Fix 630E equalier problem by modifying the equalizer workaround rule.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Support for ICS1893 10/100 Interated PHYceiver.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Support for media select by ifconfig.
-</para>
-</listitem>
-
-<listitem>
-<para>
-Added kernel-doc extratable documentation.
-</para>
-</listitem>
-
-</orderedlist>
-</para>
-</chapter>
-
-<chapter id="tested">
- <title>Tested Environment</title>
-
-<para>
-This driver is developed on the following hardware
-
-<itemizedlist>
-<listitem>
-
-<para>
-Intel Celeron 500 with SiS 630 (rev 02) chipset
-</para>
-</listitem>
-<listitem>
-
-<para>
-SiS 900 (rev 01) and SiS 7016/7014 Fast Ethernet Card
-</para>
-</listitem>
-
-</itemizedlist>
-
-and tested with these software environments
-
-<itemizedlist>
-<listitem>
-
-<para>
-Red Hat Linux version 6.2
-</para>
-</listitem>
-<listitem>
-
-<para>
-Linux kernel version 2.4.0
-</para>
-</listitem>
-<listitem>
-
-<para>
-Netscape version 4.6
-</para>
-</listitem>
-<listitem>
-
-<para>
-NcFTP 3.0.0 beta 18
-</para>
-</listitem>
-<listitem>
-
-<para>
-Samba version 2.0.3
-</para>
-</listitem>
-
-</itemizedlist>
-
-</para>
-
-</chapter>
-
-<chapter id="files">
-<title>Files in This Package</title>
-
-<para>
-In the package you can find these files:
-</para>
-
-<para>
-<variablelist>
-
-<varlistentry>
-<term>sis900.c</term>
-<listitem>
-<para>
-Driver source file in C 
-</para>
-</listitem>
-</varlistentry>
-
-<varlistentry>
-<term>sis900.h</term>
-<listitem>
-<para>
-Header file for sis900.c
-</para>
-</listitem>
-</varlistentry>
-
-<varlistentry>
-<term>sis900.sgml</term>
-<listitem>
-<para>
-DocBook SGML source of the document
-</para>
-</listitem>
-</varlistentry>
-
-<varlistentry>
-<term>sis900.txt</term>
-<listitem>
-<para>
-Driver document in plain text
-</para>
-</listitem>
-</varlistentry>
-
-</variablelist>
-</para>
-</chapter>
-
-<chapter id="install">
- <title>Installation</title>
-
-<para>
-Silicon Integrated System Corp. is cooperating closely with core Linux Kernel
-developers. The revisions of SiS 900 driver are distributed by the usuall channels
-for kernel tar files and patches. Those kernel tar files for official kernel and
-patches for kernel pre-release can be download at 
-<ulink url="http://ftp.kernel.org/pub/linux/kernel/">official kernel ftp site</ulink>
-and its mirrors.
-The 1.06 revision can be found in kernel version later than 2.3.15 and pre-2.2.14, 
-and 1.07 revision can be found in kernel version 2.4.0.
-If you have no prior experience in networking under Linux, please read
-<ulink url="http://www.tldp.org/">Ethernet HOWTO</ulink> and
-<ulink url="http://www.tldp.org/">Networking HOWTO</ulink> available from
-Linux Documentation Project (LDP).
-</para>
-
-<para>
-The driver is bundled in release later than 2.2.11 and 2.3.15 so this 
-is the most easy case. 
-Be sure you have the appropriate packages for compiling kernel source.
-Those packages are listed in Document/Changes in kernel source 
-distribution. If you have to install the driver other than those bundled
-in kernel release, you should have your driver file 
-<filename>sis900.c</filename> and <filename>sis900.h</filename> 
-copied into <filename class="directory">/usr/src/linux/drivers/net/</filename> first.
-There are two alternative ways to install the driver
-</para>
-
-<sect1>
-<title>Building the driver as loadable module</title>
-
-<para>
-To build the driver as a loadable kernel module you have to reconfigure
-the kernel to activate network support by
-</para>
-
-<para><screen>
-make menuconfig
-</screen></para>
-
-<para>
-Choose <quote>Loadable module support  ---></quote>, 
-then select <quote>Enable loadable module support</quote>.
-</para>
-
-<para>
-Choose <quote>Network Device Support  ---></quote>, select 
-<quote>Ethernet (10 or 100Mbit)</quote>.
-Then select <quote>EISA, VLB, PCI and on board controllers</quote>, 
-and choose <quote>SiS 900/7016 PCI Fast Ethernet Adapter support</quote> 
-to <quote>M</quote>.
-</para>
-
-<para>
-After reconfiguring the kernel, you can make the driver module by
-</para>
-
-<para><screen>
-make modules
-</screen></para>
-
-<para>
-The driver should be compiled with no errors. After compiling the driver,
-the driver can be installed to proper place by
-</para>
-
-<para><screen>
-make modules_install
-</screen></para>
-
-<para>
-Load the driver into kernel by
-</para>
-
-<para><screen>
-insmod sis900
-</screen></para>
-
-<para>
-When loading the driver into memory, some information message can be view by
-</para>
-
-<para>
-<screen>
-dmesg
-</screen>
-
-or
-
-<screen>
-cat /var/log/message
-</screen>
-</para>
-
-<para>
-If the driver is loaded properly you will have messages similar to this:
-</para>
-
-<para><screen>
-sis900.c: v1.07.06  11/07/2000
-eth0: SiS 900 PCI Fast Ethernet at 0xd000, IRQ 10, 00:00:e8:83:7f:a4.
-eth0: SiS 900 Internal MII PHY transceiver found at address 1.
-eth0: Using SiS 900 Internal MII PHY as default
-</screen></para>
-
-<para>
-showing the version of the driver and the results of probing routine.
-</para>
-
-<para>
-Once the driver is loaded, network can be brought up by
-</para>
-
-<para><screen>
-/sbin/ifconfig eth0 IPADDR broadcast BROADCAST netmask NETMASK media TYPE
-</screen></para>
-
-<para>
-where IPADDR, BROADCAST, NETMASK are your IP address, broadcast address and
-netmask respectively. TYPE is used to set medium type used by the device. 
-Typical values are "10baseT"(twisted-pair 10Mbps Ethernet) or "100baseT"
-(twisted-pair 100Mbps Ethernet). For more information on how to configure 
-network interface, please refer to  
-<ulink url="http://www.tldp.org/">Networking HOWTO</ulink>.
-</para>
-
-<para>
-The link status is also shown by kernel messages. For example, after the
-network interface is activated, you may have the message:
-</para>
-
-<para><screen>
-eth0: Media Link On 100mbps full-duplex
-</screen></para>
-
-<para>
-If you try to unplug the twist pair (TP) cable you will get
-</para>
-
-<para><screen>
-eth0: Media Link Off
-</screen></para>
-
-<para>
-indicating that the link is failed.
-</para>
-</sect1>
-
-<sect1>
-<title>Building the driver into kernel</title>
-
-<para>
-If you want to make the driver into kernel, choose <quote>Y</quote> 
-rather than <quote>M</quote> on 
-<quote>SiS 900/7016 PCI Fast Ethernet Adapter support</quote> 
-when configuring the kernel. Build the kernel image in the usual way
-</para>
-
-<para><screen>
-make clean
-
-make bzlilo
-</screen></para>
-
-<para>
-Next time the system reboot, you have the driver in memory.
-</para>
-
-</sect1>
-</chapter>
-
-<chapter id="problems">
- <title>Known Problems and Bugs</title>
-
-<para>
-There are some known problems and bugs. If you find any other bugs please 
-mail to <ulink url="mailto:lcchang@sis.com.tw">lcchang@sis.com.tw</ulink>
-
-<orderedlist>
-
-<listitem>
-<para>
-AM79C901 HomePNA PHY is not thoroughly tested, there may be some 
-bugs in the <quote>on the fly</quote> change of transceiver. 
-</para>
-</listitem>
-
-<listitem>
-<para>
-A bug is hidden somewhere in the receive buffer management code, 
-the bug causes NULL pointer reference in the kernel. This fault is 
-caught before bad things happen and reported with the message:
-
-<computeroutput>
-eth0: NULL pointer encountered in Rx ring, skipping 
-</computeroutput>
-
-which can be viewed with <literal remap="tt">dmesg</literal> or
-<literal remap="tt">cat /var/log/message</literal>.
-</para>
-</listitem>
-
-<listitem>
-<para>
-The media type change from 10Mbps to 100Mbps twisted-pair ethernet 
-by ifconfig causes the media link down.
-</para>
-</listitem>
-
-</orderedlist>
-</para>
-</chapter>
-
-<chapter id="RHistory">
- <title>Revision History</title>
-
-<para>
-<itemizedlist>
-
-<listitem>
-<para>
-November 13, 2000, Revision 1.07, seventh release, 630E problem fixed 
-and further clean up.
-</para>
-</listitem>
-
-<listitem>
-<para>
-November 4, 1999, Revision 1.06, Second release, lots of clean up
-and optimization.
-</para>
-</listitem>
-
-<listitem>
-<para>
-August 8, 1999, Revision 1.05, Initial Public Release
-</para>
-</listitem>
-
-</itemizedlist>
-</para>
-</chapter>
-
-<chapter id="acknowledgements">
- <title>Acknowledgements</title>
-
-<para>
-This driver was originally derived form 
-<ulink url="mailto:becker@cesdis1.gsfc.nasa.gov">Donald Becker</ulink>'s
-<ulink url="ftp://cesdis.gsfc.nasa.gov/pub/linux/drivers/kern-2.3/pci-skeleton.c"
->pci-skeleton</ulink> and
-<ulink url="ftp://cesdis.gsfc.nasa.gov/pub/linux/drivers/kern-2.3/rtl8139.c"
->rtl8139</ulink> drivers. Donald also provided various suggestion
-regarded with improvements made in revision 1.06.
-</para>
-
-<para>
-The 1.05 revision was created by 
-<ulink url="mailto:cmhuang@sis.com.tw">Jim Huang</ulink>, AMD 79c901
-support was added by <ulink url="mailto:lcs@sis.com.tw">Chin-Shan Li</ulink>.
-</para>
-</chapter>
-
-<chapter id="functions">
-<title>List of Functions</title>
-!Idrivers/net/sis900.c
-</chapter>
-
-</book>

+ 7 - 0
Documentation/feature-removal-schedule.txt

@@ -151,6 +151,13 @@ Who:	Ralf Baechle <ralf@linux-mips.org>
 
 ---------------------------
 
+What:   eepro100 network driver
+When:   January 2007
+Why:    replaced by the e100 driver
+Who:    Adrian Bunk <bunk@stusta.de>
+
+---------------------------
+
 What:	Legacy /proc/pci interface (PCI_LEGACY_PROC)
 When:	March 2006
 Why:	deprecated since 2.5.53 in favor of lspci(8)

+ 0 - 2
Documentation/networking/00-INDEX

@@ -92,8 +92,6 @@ routing.txt
 	- the new routing mechanism
 shaper.txt
 	- info on the module that can shape/limit transmitted traffic.
-sis900.txt
-	- SiS 900/7016 Fast Ethernet device driver info.
 sk98lin.txt
 	- Marvell Yukon Chipset / SysKonnect SK-98xx compliant Gigabit
 	  Ethernet Adapter family driver info

+ 6 - 6
Documentation/networking/README.ipw2100

@@ -3,18 +3,18 @@ Intel(R) PRO/Wireless 2100 Driver for Linux in support of:
 
 Intel(R) PRO/Wireless 2100 Network Connection
 
-Copyright (C) 2003-2005, Intel Corporation
+Copyright (C) 2003-2006, Intel Corporation
 
 README.ipw2100
 
-Version: 1.1.3
-Date   : October 17, 2005
+Version: git-1.1.5
+Date   : January 25, 2006
 
 Index
 -----------------------------------------------
 0. IMPORTANT INFORMATION BEFORE USING THIS DRIVER
 1. Introduction
-2. Release 1.1.3 Current Features
+2. Release git-1.1.5 Current Features
 3. Command Line Parameters
 4. Sysfs Helper Files
 5. Radio Kill Switch
@@ -89,7 +89,7 @@ potential fixes and patches, as well as links to the development mailing list
 for the driver project.
 
 
-2. Release 1.1.3 Current Supported Features
+2. Release git-1.1.5 Current Supported Features
 -----------------------------------------------
 - Managed (BSS) and Ad-Hoc (IBSS)
 - WEP (shared key and open)
@@ -270,7 +270,7 @@ For installation support on the ipw2100 1.1.0 driver on Linux kernels
 9. License
 -----------------------------------------------
 
-  Copyright(c) 2003 - 2005 Intel Corporation. All rights reserved.
+  Copyright(c) 2003 - 2006 Intel Corporation. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify it 
   under the terms of the GNU General Public License (version 2) as 

+ 42 - 2
Documentation/networking/README.ipw2200

@@ -10,7 +10,7 @@ both hardware adapters listed above. In this document the Intel(R)
 PRO/Wireless 2915ABG Driver for Linux will be used to reference the
 unified driver.
 
-Copyright (C) 2004-2005, Intel Corporation
+Copyright (C) 2004-2006, Intel Corporation
 
 README.ipw2200
 
@@ -26,9 +26,11 @@ Index
 1.2. Module parameters
 1.3. Wireless Extension Private Methods
 1.4. Sysfs Helper Files
+1.5. Supported channels
 2.   Ad-Hoc Networking
 3.   Interacting with Wireless Tools
 3.1. iwconfig mode
+3.2. iwconfig sens
 4.   About the Version Numbers
 5.   Firmware installation
 6.   Support
@@ -314,6 +316,35 @@ For the device level files, see /sys/bus/pci/drivers/ipw2200:
 	running ifconfig and is therefore disabled by default.
 
 
+1.5. Supported channels
+-----------------------------------------------
+
+Upon loading the Intel(R) PRO/Wireless 2915ABG Driver for Linux, a
+message stating the detected geography code and the number of 802.11
+channels supported by the card will be displayed in the log.
+
+The geography code corresponds to a regulatory domain as shown in the
+table below.
+
+					  Supported channels
+Code	Geography			802.11bg	802.11a
+
+---	Restricted			11 	 	 0
+ZZF	Custom US/Canada		11	 	 8
+ZZD	Rest of World			13	 	 0
+ZZA	Custom USA & Europe & High	11		13
+ZZB	Custom NA & Europe    		11		13
+ZZC	Custom Japan			11	 	 4
+ZZM	Custom 				11	 	 0
+ZZE	Europe				13		19
+ZZJ	Custom Japan			14	 	 4
+ZZR	Rest of World			14	 	 0
+ZZH	High Band			13	 	 4
+ZZG	Custom Europe			13	 	 4
+ZZK	Europe 				13		24
+ZZL	Europe				11		13
+
+
 2.   Ad-Hoc Networking
 -----------------------------------------------
 
@@ -353,6 +384,15 @@ When configuring the mode of the adapter, all run-time configured parameters
 are reset to the value used when the module was loaded.  This includes
 channels, rates, ESSID, etc.
 
+3.2 iwconfig sens
+-----------------------------------------------
+
+The 'iwconfig ethX sens XX' command will not set the signal sensitivity
+threshold, as described in iwconfig documentation, but rather the number
+of consecutive missed beacons that will trigger handover, i.e. roaming
+to another access point. At the same time, it will set the disassociation
+threshold to 3 times the given value.
+
 
 4.   About the Version Numbers
 -----------------------------------------------
@@ -408,7 +448,7 @@ For general information and support, go to:
 7.  License
 -----------------------------------------------
 
-  Copyright(c) 2003 - 2005 Intel Corporation. All rights reserved.
+  Copyright(c) 2003 - 2006 Intel Corporation. All rights reserved.
 
   This program is free software; you can redistribute it and/or modify it 
   under the terms of the GNU General Public License version 2 as 

+ 0 - 257
Documentation/networking/sis900.txt

@@ -1,257 +0,0 @@
-
-SiS 900/7016 Fast Ethernet Device Driver
-
-Ollie Lho
-
-Lei Chun Chang
-
-   Copyright © 1999 by Silicon Integrated System Corp.
-   
-   This document gives some information on installation and usage of SiS
-   900/7016 device driver under Linux.
-   
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or (at
-   your option) any later version.
-   
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-   General Public License for more details.
-   
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
-   USA
-     _________________________________________________________________
-   
-   Table of Contents
-   1. Introduction
-   2. Changes
-   3. Tested Environment
-   4. Files in This Package
-   5. Installation
-          
-        Building the driver as loadable module
-        Building the driver into kernel
-                
-   6. Known Problems and Bugs
-   7. Revision History
-   8. Acknowledgements
-     _________________________________________________________________
-   
-Chapter 1. Introduction
-
-   This document describes the revision 1.06 and 1.07 of SiS 900/7016
-   Fast Ethernet device driver under Linux. The driver is developed by
-   Silicon Integrated System Corp. and distributed freely under the GNU
-   General Public License (GPL). The driver can be compiled as a loadable
-   module and used under Linux kernel version 2.2.x. (rev. 1.06) With
-   minimal changes, the driver can also be used under 2.3.x and 2.4.x
-   kernel (rev. 1.07), please see Chapter 5. If you are intended to use
-   the driver for earlier kernels, you are on your own.
-   
-   The driver is tested with usual TCP/IP applications including FTP,
-   Telnet, Netscape etc. and is used constantly by the developers.
-   
-   Please send all comments/fixes/questions to Lei-Chun Chang.
-     _________________________________________________________________
-   
-Chapter 2. Changes
-
-   Changes made in Revision 1.07
-   
-    1. Separation of sis900.c and sis900.h in order to move most constant
-       definition to sis900.h (many of those constants were corrected)
-    2. Clean up PCI detection, the pci-scan from Donald Becker were not
-       used, just simple pci_find_*.
-    3. MII detection is modified to support multiple mii transceiver.
-    4. Bugs in read_eeprom, mdio_* were removed.
-    5. Lot of sis900 irrelevant comments were removed/changed and more
-       comments were added to reflect the real situation.
-    6. Clean up of physical/virtual address space mess in buffer
-       descriptors.
-    7. Better transmit/receive error handling.
-    8. The driver now uses zero-copy single buffer management scheme to
-       improve performance.
-    9. Names of variables were changed to be more consistent.
-   10. Clean up of auo-negotiation and timer code.
-   11. Automatic detection and change of PHY on the fly.
-   12. Bug in mac probing fixed.
-   13. Fix 630E equalier problem by modifying the equalizer workaround
-       rule.
-   14. Support for ICS1893 10/100 Interated PHYceiver.
-   15. Support for media select by ifconfig.
-   16. Added kernel-doc extratable documentation.
-     _________________________________________________________________
-   
-Chapter 3. Tested Environment
-
-   This driver is developed on the following hardware
-   
-     * Intel Celeron 500 with SiS 630 (rev 02) chipset
-     * SiS 900 (rev 01) and SiS 7016/7014 Fast Ethernet Card
-       
-   and tested with these software environments
-   
-     * Red Hat Linux version 6.2
-     * Linux kernel version 2.4.0
-     * Netscape version 4.6
-     * NcFTP 3.0.0 beta 18
-     * Samba version 2.0.3
-     _________________________________________________________________
-   
-Chapter 4. Files in This Package
-
-   In the package you can find these files:
-   
-   sis900.c
-          Driver source file in C
-          
-   sis900.h
-          Header file for sis900.c
-          
-   sis900.sgml
-          DocBook SGML source of the document
-          
-   sis900.txt
-          Driver document in plain text
-     _________________________________________________________________
-   
-Chapter 5. Installation
-
-   Silicon Integrated System Corp. is cooperating closely with core Linux
-   Kernel developers. The revisions of SiS 900 driver are distributed by
-   the usuall channels for kernel tar files and patches. Those kernel tar
-   files for official kernel and patches for kernel pre-release can be
-   download at official kernel ftp site and its mirrors. The 1.06
-   revision can be found in kernel version later than 2.3.15 and
-   pre-2.2.14, and 1.07 revision can be found in kernel version 2.4.0. If
-   you have no prior experience in networking under Linux, please read
-   Ethernet HOWTO and Networking HOWTO available from Linux Documentation
-   Project (LDP).
-   
-   The driver is bundled in release later than 2.2.11 and 2.3.15 so this
-   is the most easy case. Be sure you have the appropriate packages for
-   compiling kernel source. Those packages are listed in Document/Changes
-   in kernel source distribution. If you have to install the driver other
-   than those bundled in kernel release, you should have your driver file
-   sis900.c and sis900.h copied into /usr/src/linux/drivers/net/ first.
-   There are two alternative ways to install the driver
-     _________________________________________________________________
-   
-Building the driver as loadable module
-
-   To build the driver as a loadable kernel module you have to
-   reconfigure the kernel to activate network support by
-   
-make menuconfig
-
-   Choose "Loadable module support --->", then select "Enable loadable
-   module support".
-   
-   Choose "Network Device Support --->", select "Ethernet (10 or
-   100Mbit)". Then select "EISA, VLB, PCI and on board controllers", and
-   choose "SiS 900/7016 PCI Fast Ethernet Adapter support" to "M".
-   
-   After reconfiguring the kernel, you can make the driver module by
-   
-make modules
-
-   The driver should be compiled with no errors. After compiling the
-   driver, the driver can be installed to proper place by
-   
-make modules_install
-
-   Load the driver into kernel by
-   
-insmod sis900
-
-   When loading the driver into memory, some information message can be
-   view by
-   
-dmesg
-
-   or
-cat /var/log/message
-
-   If the driver is loaded properly you will have messages similar to
-   this:
-   
-sis900.c: v1.07.06  11/07/2000
-eth0: SiS 900 PCI Fast Ethernet at 0xd000, IRQ 10, 00:00:e8:83:7f:a4.
-eth0: SiS 900 Internal MII PHY transceiver found at address 1.
-eth0: Using SiS 900 Internal MII PHY as default
-
-   showing the version of the driver and the results of probing routine.
-   
-   Once the driver is loaded, network can be brought up by
-   
-/sbin/ifconfig eth0 IPADDR broadcast BROADCAST netmask NETMASK media TYPE
-
-   where IPADDR, BROADCAST, NETMASK are your IP address, broadcast
-   address and netmask respectively. TYPE is used to set medium type used
-   by the device. Typical values are "10baseT"(twisted-pair 10Mbps
-   Ethernet) or "100baseT" (twisted-pair 100Mbps Ethernet). For more
-   information on how to configure network interface, please refer to
-   Networking HOWTO.
-   
-   The link status is also shown by kernel messages. For example, after
-   the network interface is activated, you may have the message:
-   
-eth0: Media Link On 100mbps full-duplex
-
-   If you try to unplug the twist pair (TP) cable you will get
-   
-eth0: Media Link Off
-
-   indicating that the link is failed.
-     _________________________________________________________________
-   
-Building the driver into kernel
-
-   If you want to make the driver into kernel, choose "Y" rather than "M"
-   on "SiS 900/7016 PCI Fast Ethernet Adapter support" when configuring
-   the kernel. Build the kernel image in the usual way
-   
-make clean
-
-make bzlilo
-
-   Next time the system reboot, you have the driver in memory.
-     _________________________________________________________________
-   
-Chapter 6. Known Problems and Bugs
-
-   There are some known problems and bugs. If you find any other bugs
-   please mail to lcchang@sis.com.tw
-   
-    1. AM79C901 HomePNA PHY is not thoroughly tested, there may be some
-       bugs in the "on the fly" change of transceiver.
-    2. A bug is hidden somewhere in the receive buffer management code,
-       the bug causes NULL pointer reference in the kernel. This fault is
-       caught before bad things happen and reported with the message:
-       eth0: NULL pointer encountered in Rx ring, skipping which can be
-       viewed with dmesg or cat /var/log/message.
-    3. The media type change from 10Mbps to 100Mbps twisted-pair ethernet
-       by ifconfig causes the media link down.
-     _________________________________________________________________
-   
-Chapter 7. Revision History
-
-     * November 13, 2000, Revision 1.07, seventh release, 630E problem
-       fixed and further clean up.
-     * November 4, 1999, Revision 1.06, Second release, lots of clean up
-       and optimization.
-     * August 8, 1999, Revision 1.05, Initial Public Release
-     _________________________________________________________________
-   
-Chapter 8. Acknowledgements
-
-   This driver was originally derived form Donald Becker's pci-skeleton
-   and rtl8139 drivers. Donald also provided various suggestion regarded
-   with improvements made in revision 1.06.
-   
-   The 1.05 revision was created by Jim Huang, AMD 79c901 support was
-   added by Chin-Shan Li.

+ 2 - 3
arch/ppc/platforms/hdpu.c

@@ -319,11 +319,10 @@ static void __init hdpu_fixup_eth_pdata(struct platform_device *pd)
 	struct mv643xx_eth_platform_data *eth_pd;
 	eth_pd = pd->dev.platform_data;
 
-	eth_pd->port_serial_control =
-	    mv64x60_read(&bh, MV643XX_ETH_PORT_SERIAL_CONTROL_REG(pd->id) & ~1);
-
 	eth_pd->force_phy_addr = 1;
 	eth_pd->phy_addr = pd->id;
+	eth_pd->speed = SPEED_100;
+	eth_pd->duplex = DUPLEX_FULL;
 	eth_pd->tx_queue_size = 400;
 	eth_pd->rx_queue_size = 800;
 }

+ 15 - 25
arch/sparc/kernel/ioport.c

@@ -217,7 +217,7 @@ static void _sparc_free_io(struct resource *res)
 	unsigned long plen;
 
 	plen = res->end - res->start + 1;
-	if ((plen & (PAGE_SIZE-1)) != 0) BUG();
+	BUG_ON((plen & (PAGE_SIZE-1)) != 0);
 	sparc_unmapiorange(res->start, plen);
 	release_resource(res);
 }
@@ -512,8 +512,7 @@ void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
 dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
     int direction)
 {
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	/* IIep is write-through, not flushing. */
 	return virt_to_phys(ptr);
 }
@@ -528,8 +527,7 @@ dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
 void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size,
     int direction)
 {
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	if (direction != PCI_DMA_TODEVICE) {
 		mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
 		    (size + PAGE_SIZE-1) & PAGE_MASK);
@@ -542,8 +540,7 @@ void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size,
 dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
 			unsigned long offset, size_t size, int direction)
 {
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	/* IIep is write-through, not flushing. */
 	return page_to_phys(page) + offset;
 }
@@ -551,8 +548,7 @@ dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
 void pci_unmap_page(struct pci_dev *hwdev,
 			dma_addr_t dma_address, size_t size, int direction)
 {
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	/* mmu_inval_dma_area XXX */
 }
 
@@ -576,11 +572,10 @@ int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents,
 {
 	int n;
 
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	/* IIep is write-through, not flushing. */
 	for (n = 0; n < nents; n++) {
-		if (page_address(sg->page) == NULL) BUG();
+		BUG_ON(page_address(sg->page) == NULL);
 		sg->dvma_address = virt_to_phys(page_address(sg->page));
 		sg->dvma_length = sg->length;
 		sg++;
@@ -597,11 +592,10 @@ void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents,
 {
 	int n;
 
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	if (direction != PCI_DMA_TODEVICE) {
 		for (n = 0; n < nents; n++) {
-			if (page_address(sg->page) == NULL) BUG();
+			BUG_ON(page_address(sg->page) == NULL);
 			mmu_inval_dma_area(
 			    (unsigned long) page_address(sg->page),
 			    (sg->length + PAGE_SIZE-1) & PAGE_MASK);
@@ -622,8 +616,7 @@ void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents,
  */
 void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
 {
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	if (direction != PCI_DMA_TODEVICE) {
 		mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
 		    (size + PAGE_SIZE-1) & PAGE_MASK);
@@ -632,8 +625,7 @@ void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t si
 
 void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
 {
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	if (direction != PCI_DMA_TODEVICE) {
 		mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
 		    (size + PAGE_SIZE-1) & PAGE_MASK);
@@ -650,11 +642,10 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int
 {
 	int n;
 
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	if (direction != PCI_DMA_TODEVICE) {
 		for (n = 0; n < nents; n++) {
-			if (page_address(sg->page) == NULL) BUG();
+			BUG_ON(page_address(sg->page) == NULL);
 			mmu_inval_dma_area(
 			    (unsigned long) page_address(sg->page),
 			    (sg->length + PAGE_SIZE-1) & PAGE_MASK);
@@ -667,11 +658,10 @@ void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, i
 {
 	int n;
 
-	if (direction == PCI_DMA_NONE)
-		BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 	if (direction != PCI_DMA_TODEVICE) {
 		for (n = 0; n < nents; n++) {
-			if (page_address(sg->page) == NULL) BUG();
+			BUG_ON(page_address(sg->page) == NULL);
 			mmu_inval_dma_area(
 			    (unsigned long) page_address(sg->page),
 			    (sg->length + PAGE_SIZE-1) & PAGE_MASK);

+ 18 - 0
arch/sparc64/Kconfig

@@ -186,6 +186,15 @@ endchoice
 
 endmenu
 
+config ARCH_SPARSEMEM_ENABLE
+	def_bool y
+
+config ARCH_SPARSEMEM_DEFAULT
+	def_bool y
+
+config LARGE_ALLOCS
+	def_bool y
+
 source "mm/Kconfig"
 
 config GENERIC_ISA_DMA
@@ -350,6 +359,15 @@ config SOLARIS_EMUL
 
 endmenu
 
+config SCHED_SMT
+	bool "SMT (Hyperthreading) scheduler support"
+	depends on SMP
+	default y
+	help
+	  SMT scheduler support improves the CPU scheduler's decision making
+	  when dealing with UltraSPARC cpus at a cost of slightly increased
+	  overhead in some places. If unsure say N here.
+
 config CMDLINE_BOOL
 	bool "Default bootloader kernel arguments"
 

+ 14 - 11
arch/sparc64/defconfig

@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16-rc2
-# Tue Feb  7 17:47:18 2006
+# Linux kernel version: 2.6.16
+# Mon Mar 20 01:23:21 2006
 #
 CONFIG_SPARC=y
 CONFIG_SPARC64=y
@@ -115,14 +115,20 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_HUGETLB_PAGE_SIZE_4MB=y
 # CONFIG_HUGETLB_PAGE_SIZE_512K is not set
 # CONFIG_HUGETLB_PAGE_SIZE_64K is not set
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_LARGE_ALLOCS=y
 CONFIG_SELECT_MEMORY_MODEL=y
-CONFIG_FLATMEM_MANUAL=y
+# CONFIG_FLATMEM_MANUAL is not set
 # CONFIG_DISCONTIGMEM_MANUAL is not set
-# CONFIG_SPARSEMEM_MANUAL is not set
-CONFIG_FLATMEM=y
-CONFIG_FLAT_NODE_MEM_MAP=y
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_HAVE_MEMORY_PRESENT=y
 # CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPARSEMEM_EXTREME=y
+CONFIG_MEMORY_HOTPLUG=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
 CONFIG_GENERIC_ISA_DMA=y
 CONFIG_SBUS=y
 CONFIG_SBUSCHAR=y
@@ -655,6 +661,7 @@ CONFIG_SERIAL_SUNCORE=y
 CONFIG_SERIAL_SUNSU=y
 CONFIG_SERIAL_SUNSU_CONSOLE=y
 CONFIG_SERIAL_SUNSAB=m
+CONFIG_SERIAL_SUNHV=y
 CONFIG_SERIAL_CORE=y
 CONFIG_SERIAL_CORE_CONSOLE=y
 # CONFIG_SERIAL_JSM is not set
@@ -1116,11 +1123,7 @@ CONFIG_USB_HIDDEV=y
 # CONFIG_INFINIBAND is not set
 
 #
-# SN Devices
-#
-
-#
-# EDAC - error detection and reporting (RAS)
+# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
 #
 
 #

+ 5 - 3
arch/sparc64/kernel/Makefile

@@ -11,10 +11,12 @@ obj-y		:= process.o setup.o cpu.o idprom.o \
 		   traps.o devices.o auxio.o una_asm.o \
 		   irq.o ptrace.o time.o sys_sparc.o signal.o \
 		   unaligned.o central.o pci.o starfire.o semaphore.o \
-		   power.o sbus.o iommu_common.o sparc64_ksyms.o chmc.o
+		   power.o sbus.o iommu_common.o sparc64_ksyms.o chmc.o \
+		   visemul.o
 
 obj-$(CONFIG_PCI)	 += ebus.o isa.o pci_common.o pci_iommu.o \
-			    pci_psycho.o pci_sabre.o pci_schizo.o
+			    pci_psycho.o pci_sabre.o pci_schizo.o \
+			    pci_sun4v.o pci_sun4v_asm.o
 obj-$(CONFIG_SMP)	 += smp.o trampoline.o
 obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o
 obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o
@@ -38,5 +40,5 @@ else
   CMODEL_CFLAG := -m64 -mcmodel=medlow
 endif
 
-head.o: head.S ttable.S itlb_base.S dtlb_base.S dtlb_backend.S dtlb_prot.S \
+head.o: head.S ttable.S itlb_miss.S dtlb_miss.S ktlb.S tsb.S \
 	etrap.S rtrap.S winfixup.S entry.S

+ 5 - 9
arch/sparc64/kernel/binfmt_aout32.c

@@ -31,6 +31,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
 
 static int load_aout32_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout32_library(struct file*);
@@ -238,6 +239,8 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
+	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
@@ -329,15 +332,8 @@ beyond_if:
 
 	current->mm->start_stack =
 		(unsigned long) create_aout32_tables((char __user *)bprm->p, bprm);
-	if (!(orig_thr_flags & _TIF_32BIT)) {
-		unsigned long pgd_cache = get_pgd_cache(current->mm->pgd);
-
-		__asm__ __volatile__("stxa\t%0, [%1] %2\n\t"
-				     "membar #Sync"
-				     : /* no outputs */
-				     : "r" (pgd_cache),
-				       "r" (TSB_REG), "i" (ASI_DMMU));
-	}
+	tsb_context_switch(current->mm);
+
 	start_thread32(regs, ex.a_entry, current->mm->start_stack);
 	if (current->ptrace & PT_PTRACED)
 		send_sig(SIGTRAP, current, 0);

+ 3 - 1
arch/sparc64/kernel/binfmt_elf32.c

@@ -153,7 +153,9 @@ MODULE_AUTHOR("Eric Youngdale, David S. Miller, Jakub Jelinek");
 #undef MODULE_DESCRIPTION
 #undef MODULE_AUTHOR
 
+#include <asm/a.out.h>
+
 #undef TASK_SIZE
-#define TASK_SIZE 0xf0000000
+#define TASK_SIZE STACK_TOP32
 
 #include "../../../fs/binfmt_elf.c"

+ 7 - 0
arch/sparc64/kernel/cpu.c

@@ -13,6 +13,7 @@
 #include <asm/system.h>
 #include <asm/fpumacro.h>
 #include <asm/cpudata.h>
+#include <asm/spitfire.h>
 
 DEFINE_PER_CPU(cpuinfo_sparc, __cpu_data) = { 0 };
 
@@ -71,6 +72,12 @@ void __init cpu_probe(void)
 	unsigned long ver, fpu_vers, manuf, impl, fprs;
 	int i;
 	
+	if (tlb_type == hypervisor) {
+		sparc_cpu_type = "UltraSparc T1 (Niagara)";
+		sparc_fpu_type = "UltraSparc T1 integrated FPU";
+		return;
+	}
+
 	fprs = fprs_read();
 	fprs_write(FPRS_FEF);
 	__asm__ __volatile__ ("rdpr %%ver, %0; stx %%fsr, [%1]"

+ 179 - 10
arch/sparc64/kernel/devices.c

@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
+#include <linux/bootmem.h>
 
 #include <asm/page.h>
 #include <asm/oplib.h>
@@ -20,6 +21,8 @@
 #include <asm/spitfire.h>
 #include <asm/timer.h>
 #include <asm/cpudata.h>
+#include <asm/vdev.h>
+#include <asm/irq.h>
 
 /* Used to synchronize acceses to NatSemi SUPER I/O chip configure
  * operations in asm/ns87303.h
@@ -29,13 +32,158 @@ DEFINE_SPINLOCK(ns87303_lock);
 extern void cpu_probe(void);
 extern void central_probe(void);
 
-static char *cpu_mid_prop(void)
+u32 sun4v_vdev_devhandle;
+int sun4v_vdev_root;
+
+struct vdev_intmap {
+	unsigned int phys;
+	unsigned int irq;
+	unsigned int cnode;
+	unsigned int cinterrupt;
+};
+
+struct vdev_intmask {
+	unsigned int phys;
+	unsigned int interrupt;
+	unsigned int __unused;
+};
+
+static struct vdev_intmap *vdev_intmap;
+static int vdev_num_intmap;
+static struct vdev_intmask vdev_intmask;
+
+static void __init sun4v_virtual_device_probe(void)
+{
+	struct linux_prom64_registers regs;
+	struct vdev_intmap *ip;
+	int node, sz, err;
+
+	if (tlb_type != hypervisor)
+		return;
+
+	node = prom_getchild(prom_root_node);
+	node = prom_searchsiblings(node, "virtual-devices");
+	if (!node) {
+		prom_printf("SUN4V: Fatal error, no virtual-devices node.\n");
+		prom_halt();
+	}
+
+	sun4v_vdev_root = node;
+
+	prom_getproperty(node, "reg", (char *)&regs, sizeof(regs));
+	sun4v_vdev_devhandle = (regs.phys_addr >> 32UL) & 0x0fffffff;
+
+	sz = prom_getproplen(node, "interrupt-map");
+	if (sz <= 0) {
+		prom_printf("SUN4V: Error, no vdev interrupt-map.\n");
+		prom_halt();
+	}
+
+	if ((sz % sizeof(*ip)) != 0) {
+		prom_printf("SUN4V: Bogus interrupt-map property size %d\n",
+			    sz);
+		prom_halt();
+	}
+
+	vdev_intmap = ip = alloc_bootmem_low_pages(sz);
+	if (!vdev_intmap) {
+		prom_printf("SUN4V: Error, cannot allocate vdev_intmap.\n");
+		prom_halt();
+	}
+
+	err = prom_getproperty(node, "interrupt-map", (char *) ip, sz);
+	if (err == -1) {
+		prom_printf("SUN4V: Fatal error, no vdev interrupt-map.\n");
+		prom_halt();
+	}
+	if (err != sz) {
+		prom_printf("SUN4V: Inconsistent interrupt-map size, "
+			    "proplen(%d) vs getprop(%d).\n", sz,err);
+		prom_halt();
+	}
+
+	vdev_num_intmap = err / sizeof(*ip);
+
+	err = prom_getproperty(node, "interrupt-map-mask",
+			       (char *) &vdev_intmask,
+			       sizeof(vdev_intmask));
+	if (err <= 0) {
+		prom_printf("SUN4V: Fatal error, no vdev "
+			    "interrupt-map-mask.\n");
+		prom_halt();
+	}
+	if (err % sizeof(vdev_intmask)) {
+		prom_printf("SUN4V: Bogus interrupt-map-mask "
+			    "property size %d\n", err);
+		prom_halt();
+	}
+
+	printk("SUN4V: virtual-devices devhandle[%x]\n",
+	       sun4v_vdev_devhandle);
+}
+
+unsigned int sun4v_vdev_device_interrupt(unsigned int dev_node)
+{
+	unsigned int irq, reg;
+	int err, i;
+
+	err = prom_getproperty(dev_node, "interrupts",
+			       (char *) &irq, sizeof(irq));
+	if (err <= 0) {
+		printk("VDEV: Cannot get \"interrupts\" "
+		       "property for OBP node %x\n", dev_node);
+		return 0;
+	}
+
+	err = prom_getproperty(dev_node, "reg",
+			       (char *) &reg, sizeof(reg));
+	if (err <= 0) {
+		printk("VDEV: Cannot get \"reg\" "
+		       "property for OBP node %x\n", dev_node);
+		return 0;
+	}
+
+	for (i = 0; i < vdev_num_intmap; i++) {
+		if (vdev_intmap[i].phys == (reg & vdev_intmask.phys) &&
+		    vdev_intmap[i].irq == (irq & vdev_intmask.interrupt)) {
+			irq = vdev_intmap[i].cinterrupt;
+			break;
+		}
+	}
+
+	if (i == vdev_num_intmap) {
+		printk("VDEV: No matching interrupt map entry "
+		       "for OBP node %x\n", dev_node);
+		return 0;
+	}
+
+	return sun4v_build_irq(sun4v_vdev_devhandle, irq, 5, 0);
+}
+
+static const char *cpu_mid_prop(void)
 {
 	if (tlb_type == spitfire)
 		return "upa-portid";
 	return "portid";
 }
 
+static int get_cpu_mid(int prom_node)
+{
+	if (tlb_type == hypervisor) {
+		struct linux_prom64_registers reg;
+
+		if (prom_getproplen(prom_node, "cpuid") == 4)
+			return prom_getintdefault(prom_node, "cpuid", 0);
+
+		prom_getproperty(prom_node, "reg", (char *) &reg, sizeof(reg));
+		return (reg.phys_addr >> 32) & 0x0fffffffUL;
+	} else {
+		const char *prop_name = cpu_mid_prop();
+
+		return prom_getintdefault(prom_node, prop_name, 0);
+	}
+}
+
 static int check_cpu_node(int nd, int *cur_inst,
 			  int (*compare)(int, int, void *), void *compare_arg,
 			  int *prom_node, int *mid)
@@ -50,7 +198,7 @@ static int check_cpu_node(int nd, int *cur_inst,
 		if (prom_node)
 			*prom_node = nd;
 		if (mid)
-			*mid = prom_getintdefault(nd, cpu_mid_prop(), 0);
+			*mid = get_cpu_mid(nd);
 		return 0;
 	}
 
@@ -105,7 +253,7 @@ static int cpu_mid_compare(int nd, int instance, void *_arg)
 	int desired_mid = (int) (long) _arg;
 	int this_mid;
 
-	this_mid = prom_getintdefault(nd, cpu_mid_prop(), 0);
+	this_mid = get_cpu_mid(nd);
 	if (this_mid == desired_mid)
 		return 0;
 	return -ENODEV;
@@ -126,7 +274,8 @@ void __init device_scan(void)
 
 #ifndef CONFIG_SMP
 	{
-		int err, cpu_node;
+		int err, cpu_node, def;
+
 		err = cpu_find_by_instance(0, &cpu_node, NULL);
 		if (err) {
 			prom_printf("No cpu nodes, cannot continue\n");
@@ -135,21 +284,40 @@ void __init device_scan(void)
 		cpu_data(0).clock_tick = prom_getintdefault(cpu_node,
 							    "clock-frequency",
 							    0);
+
+		def = ((tlb_type == hypervisor) ?
+		       (8 * 1024) :
+		       (16 * 1024));
 		cpu_data(0).dcache_size = prom_getintdefault(cpu_node,
 							     "dcache-size",
-							     16 * 1024);
+							     def);
+
+		def = 32;
 		cpu_data(0).dcache_line_size =
-			prom_getintdefault(cpu_node, "dcache-line-size", 32);
+			prom_getintdefault(cpu_node, "dcache-line-size",
+					   def);
+
+		def = 16 * 1024;
 		cpu_data(0).icache_size = prom_getintdefault(cpu_node,
 							     "icache-size",
-							     16 * 1024);
+							     def);
+
+		def = 32;
 		cpu_data(0).icache_line_size =
-			prom_getintdefault(cpu_node, "icache-line-size", 32);
+			prom_getintdefault(cpu_node, "icache-line-size",
+					   def);
+
+		def = ((tlb_type == hypervisor) ?
+		       (3 * 1024 * 1024) :
+		       (4 * 1024 * 1024));
 		cpu_data(0).ecache_size = prom_getintdefault(cpu_node,
 							     "ecache-size",
-							     4 * 1024 * 1024);
+							     def);
+
+		def = 64;
 		cpu_data(0).ecache_line_size =
-			prom_getintdefault(cpu_node, "ecache-line-size", 64);
+			prom_getintdefault(cpu_node, "ecache-line-size",
+					   def);
 		printk("CPU[0]: Caches "
 		       "D[sz(%d):line_sz(%d)] "
 		       "I[sz(%d):line_sz(%d)] "
@@ -160,6 +328,7 @@ void __init device_scan(void)
 	}
 #endif
 
+	sun4v_virtual_device_probe();
 	central_probe();
 
 	cpu_probe();

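A note on the interrupt-map lookup in sun4v_vdev_device_interrupt() above: per the OBP interrupt-map convention, the child's ("reg", "interrupts") pair is masked with interrupt-map-mask and compared against each map entry. A stand-alone sketch of that matching with made-up property values (the struct layouts here are simplified stand-ins, not the kernel's types):

#include <stdio.h>

struct vdev_intmap { unsigned int phys, irq, cnode, cinterrupt; };
struct vdev_intmask { unsigned int phys, interrupt, __unused; };

int main(void)
{
	struct vdev_intmap map[2] = {
		{ 0x100, 1, 0, 0x21 },
		{ 0x200, 1, 0, 0x22 },
	};
	struct vdev_intmask mask = { 0xf00, 0xf, 0 };
	unsigned int reg = 0x204, irq = 1;	/* device "reg"/"interrupts" */
	int i;

	for (i = 0; i < 2; i++) {
		if (map[i].phys == (reg & mask.phys) &&
		    map[i].irq == (irq & mask.interrupt)) {
			printf("devino %x\n", map[i].cinterrupt);	/* prints 0x22 */
			break;
		}
	}
	return 0;
}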
+ 0 - 170
arch/sparc64/kernel/dtlb_backend.S

@@ -1,170 +0,0 @@
-/* $Id: dtlb_backend.S,v 1.16 2001/10/09 04:02:11 davem Exp $
- * dtlb_backend.S: Back end to DTLB miss replacement strategy.
- *                 This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-
-#define VALID_SZ_BITS	(_PAGE_VALID | _PAGE_SZBITS)
-
-#define VPTE_BITS		(_PAGE_CP | _PAGE_CV | _PAGE_P )
-#define VPTE_SHIFT		(PAGE_SHIFT - 3)
-
-/* Ways we can get here:
- *
- * 1) Nucleus loads and stores to/from PA-->VA direct mappings at tl>1.
- * 2) Nucleus loads and stores to/from user/kernel window save areas.
- * 3) VPTE misses from dtlb_base and itlb_base.
- *
- * We need to extract out the PMD and PGDIR indexes from the
- * linear virtual page table access address.  The PTE index
- * is at the bottom, but we are not concerned with it.  Bits
- * 0 to 2 are clear since each PTE is 8 bytes in size.  Each
- * PMD and PGDIR entry are 4 bytes in size.   Thus, this
- * address looks something like:
- *
- * |---------------------------------------------------------------|
- * |  ...   |    PGDIR index    |    PMD index    | PTE index  |   |
- * |---------------------------------------------------------------|
- *   63   F   E               D   C             B   A         3 2 0  <- bit nr
- *
- *  The variable bits above are defined as:
- *  A --> 3 + (PAGE_SHIFT - log2(8))
- *    --> 3 + (PAGE_SHIFT - 3) - 1
- *        (ie. this is "bit 3" + PAGE_SIZE - size of PTE entry in bits - 1)
- *  B --> A + 1
- *  C --> B + (PAGE_SHIFT - log2(4))
- *    -->  B + (PAGE_SHIFT - 2) - 1
- *        (ie. this is "bit B" + PAGE_SIZE - size of PMD entry in bits - 1)
- *  D --> C + 1
- *  E --> D + (PAGE_SHIFT - log2(4))
- *    --> D + (PAGE_SHIFT - 2) - 1
- *        (ie. this is "bit D" + PAGE_SIZE - size of PGDIR entry in bits - 1)
- *  F --> E + 1
- *
- * (Note how "B" always evalutes to PAGE_SHIFT, all the other constants
- *  cancel out.)
- *
- * For 8K PAGE_SIZE (thus, PAGE_SHIFT of 13) the bit numbers are:
- * A --> 12
- * B --> 13
- * C --> 23
- * D --> 24
- * E --> 34
- * F --> 35
- *
- * For 64K PAGE_SIZE (thus, PAGE_SHIFT of 16) the bit numbers are:
- * A --> 15
- * B --> 16
- * C --> 29
- * D --> 30
- * E --> 43
- * F --> 44
- *
- * Because bits both above and below each PGDIR and PMD index need to
- * be masked out, and the index can be as long as 14 bits (when using a
- * 64K PAGE_SIZE, and thus a PAGE_SHIFT of 16), we need 3 instructions
- * to extract each index out.
- *
- * Shifts do not pair very well on UltraSPARC-I, II, IIi, and IIe, so
- * we try to avoid using them for the entire operation.  We could setup
- * a mask anywhere from bit 31 down to bit 10 using the sethi instruction.
- *
- * We need a mask covering bits B --> C and one covering D --> E.
- * For 8K PAGE_SIZE these masks are 0x00ffe000 and 0x7ff000000.
- * For 64K PAGE_SIZE these masks are 0x3fff0000 and 0xfffc0000000.
- * The second in each set cannot be loaded with a single sethi
- * instruction, because the upper bits are past bit 32.  We would
- * need to use a sethi + a shift.
- *
- * For the time being, we use 2 shifts and a simple "and" mask.
- * We shift left to clear the bits above the index, we shift down
- * to clear the bits below the index (sans the log2(4 or 8) bits)
- * and a mask to clear the log2(4 or 8) bits.  We need therefore
- * define 4 shift counts, all of which are relative to PAGE_SHIFT.
- *
- * Although unsupportable for other reasons, this does mean that
- * 512K and 4MB page sizes would be generaally supported by the
- * kernel.  (ELF binaries would break with > 64K PAGE_SIZE since
- * the sections are only aligned that strongly).
- *
- * The operations performed for extraction are thus:
- *
- *      ((X << FOO_SHIFT_LEFT) >> FOO_SHIFT_RIGHT) & ~0x3
- *
- */
-
-#define A (3 + (PAGE_SHIFT - 3) - 1)
-#define B (A + 1)
-#define C (B + (PAGE_SHIFT - 2) - 1)
-#define D (C + 1)
-#define E (D + (PAGE_SHIFT - 2) - 1)
-#define F (E + 1)
-
-#define PMD_SHIFT_LEFT		(64 - D)
-#define PMD_SHIFT_RIGHT		(64 - (D - B) - 2)
-#define PGDIR_SHIFT_LEFT 	(64 - F)
-#define PGDIR_SHIFT_RIGHT	(64 - (F - D) - 2)
-#define LOW_MASK_BITS		0x3
-
-/* TLB1 ** ICACHE line 1: tl1 DTLB and quick VPTE miss	*/
-	ldxa		[%g1 + %g1] ASI_DMMU, %g4	! Get TAG_ACCESS
-	add		%g3, %g3, %g5			! Compute VPTE base
-	cmp		%g4, %g5			! VPTE miss?
-	bgeu,pt		%xcc, 1f			! Continue here
-	 andcc		%g4, TAG_CONTEXT_BITS, %g5	! tl0 miss Nucleus test
-	ba,a,pt		%xcc, from_tl1_trap		! Fall to tl0 miss
-1:	sllx		%g6, VPTE_SHIFT, %g4		! Position TAG_ACCESS
-	or		%g4, %g5, %g4			! Prepare TAG_ACCESS
-
-/* TLB1 ** ICACHE line 2: Quick VPTE miss	  	*/
-	mov		TSB_REG, %g1			! Grab TSB reg
-	ldxa		[%g1] ASI_DMMU, %g5		! Doing PGD caching?
-	sllx		%g6, PMD_SHIFT_LEFT, %g1	! Position PMD offset
-	be,pn		%xcc, sparc64_vpte_nucleus	! Is it from Nucleus?
-	 srlx		%g1, PMD_SHIFT_RIGHT, %g1	! Mask PMD offset bits
-	brnz,pt		%g5, sparc64_vpte_continue	! Yep, go like smoke
-	 andn		%g1, LOW_MASK_BITS, %g1		! Final PMD mask
-	sllx		%g6, PGDIR_SHIFT_LEFT, %g5	! Position PGD offset
-
-/* TLB1 ** ICACHE line 3: Quick VPTE miss	  	*/
-	srlx		%g5, PGDIR_SHIFT_RIGHT, %g5	! Mask PGD offset bits
-	andn		%g5, LOW_MASK_BITS, %g5		! Final PGD mask
-	lduwa		[%g7 + %g5] ASI_PHYS_USE_EC, %g5! Load PGD
-	brz,pn		%g5, vpte_noent			! Valid?
-sparc64_kpte_continue:
-	 sllx		%g5, 11, %g5			! Shift into place
-sparc64_vpte_continue:
-	lduwa		[%g5 + %g1] ASI_PHYS_USE_EC, %g5! Load PMD
-	sllx		%g5, 11, %g5			! Shift into place
-	brz,pn		%g5, vpte_noent			! Valid?
-
-/* TLB1 ** ICACHE line 4: Quick VPTE miss	  	*/
-	 mov		(VALID_SZ_BITS >> 61), %g1	! upper vpte into %g1
-	sllx		%g1, 61, %g1			! finish calc
-	or		%g5, VPTE_BITS, %g5		! Prepare VPTE data
-	or		%g5, %g1, %g5			! ...
-	mov		TLB_SFSR, %g1			! Restore %g1 value
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Load VPTE into TLB
-	stxa		%g4, [%g1 + %g1] ASI_DMMU	! Restore previous TAG_ACCESS
-	retry						! Load PTE once again
-
-#undef VALID_SZ_BITS
-#undef VPTE_SHIFT
-#undef VPTE_BITS
-#undef A
-#undef B
-#undef C
-#undef D
-#undef E
-#undef F
-#undef PMD_SHIFT_LEFT
-#undef PMD_SHIFT_RIGHT
-#undef PGDIR_SHIFT_LEFT
-#undef PGDIR_SHIFT_RIGHT
-#undef LOW_MASK_BITS
-

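The A-F bit positions documented in the removed comment are pure functions of PAGE_SHIFT; a stand-alone check of that arithmetic for the two page sizes the comment works through:

#include <stdio.h>

static void vpte_bits(int page_shift)
{
	int A = 3 + (page_shift - 3) - 1;	/* top bit of the PTE index */
	int B = A + 1;				/* bottom bit of the PMD index */
	int C = B + (page_shift - 2) - 1;
	int D = C + 1;				/* bottom bit of the PGDIR index */
	int E = D + (page_shift - 2) - 1;
	int F = E + 1;

	printf("PAGE_SHIFT=%2d: A=%d B=%d C=%d D=%d E=%d F=%d\n",
	       page_shift, A, B, C, D, E, F);
}

int main(void)
{
	vpte_bits(13);	/* 8K pages:  12 13 23 24 34 35 */
	vpte_bits(16);	/* 64K pages: 15 16 29 30 43 44 */
	return 0;
}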
+ 0 - 109
arch/sparc64/kernel/dtlb_base.S

@@ -1,109 +0,0 @@
-/* $Id: dtlb_base.S,v 1.17 2001/10/11 22:33:52 davem Exp $
- * dtlb_base.S:	Front end to DTLB miss replacement strategy.
- *              This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-
-/* %g1	TLB_SFSR	(%g1 + %g1 == TLB_TAG_ACCESS)
- * %g2	(KERN_HIGHBITS | KERN_LOWBITS)
- * %g3  VPTE base	(0xfffffffe00000000)	Spitfire/Blackbird (44-bit VA space)
- *			(0xffe0000000000000)	Cheetah		   (64-bit VA space)
- * %g7	__pa(current->mm->pgd)
- *
- * The VPTE base value is completely magic, but note that
- * few places in the kernel other than these TLB miss
- * handlers know anything about the VPTE mechanism or
- * how it works (see VPTE_SIZE, TASK_SIZE and PTRS_PER_PGD).
- * Consider the 44-bit VADDR Ultra-I/II case as an example:
- *
- * VA[0 :  (1<<43)] produce VPTE index [%g3                        :   0]
- * VA[0 : -(1<<43)] produce VPTE index [%g3-(1<<(43-PAGE_SHIFT+3)) : %g3]
- *
- * For Cheetah's 64-bit VADDR space this is:
- *
- * VA[0 :  (1<<63)] produce VPTE index [%g3                        :   0]
- * VA[0 : -(1<<63)] produce VPTE index [%g3-(1<<(63-PAGE_SHIFT+3)) : %g3]
- *
- * If you're paying attention you'll notice that this means half of
- * the VPTE table is above %g3 and half is below, low VA addresses
- * map progressively upwards from %g3, and high VA addresses map
- * progressively upwards towards %g3.  This trick was needed to make
- * the same 8 instruction handler work both for Spitfire/Blackbird's
- * peculiar VA space hole configuration and the full 64-bit VA space
- * one of Cheetah at the same time.
- */
-
-/* Ways we can get here:
- *
- * 1) Nucleus loads and stores to/from PA-->VA direct mappings.
- * 2) Nucleus loads and stores to/from vmalloc() areas.
- * 3) User loads and stores.
- * 4) User space accesses by nucleus at tl0
- */
-
-#if PAGE_SHIFT == 13
-/*
- * To compute vpte offset, we need to do ((addr >> 13) << 3),
- * which can be optimized to (addr >> 10) if bits 10/11/12 can
- * be guaranteed to be 0 ... mmu_context.h does guarantee this
- * by only using 10 bits in the hwcontext value.
- */
-#define CREATE_VPTE_OFFSET1(r1, r2) nop
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				srax	r1, 10, r2
-#else
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, PAGE_SHIFT, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				sllx	r2, 3, r2
-#endif
-
-/* DTLB ** ICACHE line 1: Quick user TLB misses		*/
-	mov		TLB_SFSR, %g1
-	ldxa		[%g1 + %g1] ASI_DMMU, %g4	! Get TAG_ACCESS
-	andcc		%g4, TAG_CONTEXT_BITS, %g0	! From Nucleus?
-from_tl1_trap:
-	rdpr		%tl, %g5			! For TL==3 test
-	CREATE_VPTE_OFFSET1(%g4, %g6)			! Create VPTE offset
-	be,pn		%xcc, kvmap			! Yep, special processing
-	 CREATE_VPTE_OFFSET2(%g4, %g6)			! Create VPTE offset
-	cmp		%g5, 4				! Last trap level?
-
-/* DTLB ** ICACHE line 2: User finish + quick kernel TLB misses	*/
-	be,pn		%xcc, longpath			! Yep, cannot risk VPTE miss
-	 nop						! delay slot
-	ldxa		[%g3 + %g6] ASI_S, %g5		! Load VPTE
-1:	brgez,pn	%g5, longpath			! Invalid, branch out
-	 nop						! Delay-slot
-9:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
-	retry						! Trap return
-	nop
-
-/* DTLB ** ICACHE line 3: winfixups+real_faults		*/
-longpath:
-	rdpr		%pstate, %g5			! Move into alternate globals
-	wrpr		%g5, PSTATE_AG|PSTATE_MG, %pstate
-	rdpr		%tl, %g4			! See where we came from.
-	cmp		%g4, 1				! Is etrap/rtrap window fault?
-	mov		TLB_TAG_ACCESS, %g4		! Prepare for fault processing
-	ldxa		[%g4] ASI_DMMU, %g5		! Load faulting VA page
-	be,pt		%xcc, sparc64_realfault_common	! Jump to normal fault handling
-	 mov		FAULT_CODE_DTLB, %g4		! It was read from DTLB
-
-/* DTLB ** ICACHE line 4: Unused...	*/
-	ba,a,pt		%xcc, winfix_trampoline		! Call window fixup code
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-
-#undef CREATE_VPTE_OFFSET1
-#undef CREATE_VPTE_OFFSET2

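The removed CREATE_VPTE_OFFSET fast path leans on the identity its comment states: when bits 10..12 of the address are guaranteed zero (mmu_context.h only used 10 context bits), ((addr >> 13) << 3) equals (addr >> 10), saving one shift in the handler. A quick stand-alone check (plain logical shifts here; the handler itself used the arithmetic srax on the sign-extended address):

#include <assert.h>

int main(void)
{
	unsigned long addr = 0xfffff80000002000UL;	/* bits 10..12 clear */

	assert(((addr >> 13) << 3) == (addr >> 10));
	return 0;
}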
+ 39 - 0
arch/sparc64/kernel/dtlb_miss.S

@@ -0,0 +1,39 @@
+/* DTLB ** ICACHE line 1: Context 0 check and TSB load	*/
+	ldxa	[%g0] ASI_DMMU_TSB_8KB_PTR, %g1	! Get TSB 8K pointer
+	ldxa	[%g0] ASI_DMMU, %g6		! Get TAG TARGET
+	srlx	%g6, 48, %g5			! Get context
+	sllx	%g6, 22, %g6			! Zero out context
+	brz,pn	%g5, kvmap_dtlb			! Context 0 processing
+	 srlx	%g6, 22, %g6			! Delay slot
+	TSB_LOAD_QUAD(%g1, %g4)			! Load TSB entry
+	cmp	%g4, %g6			! Compare TAG
+
+/* DTLB ** ICACHE line 2: TSB compare and TLB load	*/
+	bne,pn	%xcc, tsb_miss_dtlb		! Miss
+	 mov	FAULT_CODE_DTLB, %g3
+	stxa	%g5, [%g0] ASI_DTLB_DATA_IN	! Load TLB
+	retry					! Trap done
+	nop
+	nop
+	nop
+	nop
+
+/* DTLB ** ICACHE line 3:				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+/* DTLB ** ICACHE line 4: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop

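The new handler above is the entire per-miss fast path: split the hardware-computed TAG TARGET into context and tag, send context 0 to the kernel mapping path, otherwise compare against the TSB entry the hardware pointer selects. A rough C model of that control flow (the helper names are invented for the sketch):

struct tsb_entry { unsigned long tag, pte; };

extern unsigned long kvmap_dtlb(unsigned long vaddr_tag);
extern unsigned long tsb_miss_dtlb(unsigned long vaddr_tag);

static unsigned long dtlb_fast_miss(struct tsb_entry *ent, unsigned long tag_target)
{
	unsigned long context = tag_target >> 48;
	unsigned long tag = (tag_target << 22) >> 22;	/* strip context bits */

	if (context == 0)
		return kvmap_dtlb(tag);		/* kernel/context-0 processing */
	if (ent->tag != tag)
		return tsb_miss_dtlb(tag);	/* TSB miss, refill slow path */
	return ent->pte;			/* stored straight into the DTLB */
}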
+ 1 - 2
arch/sparc64/kernel/ebus.c

@@ -277,10 +277,9 @@ static inline void *ebus_alloc(size_t size)
 {
 	void *mem;
 
-	mem = kmalloc(size, GFP_ATOMIC);
+	mem = kzalloc(size, GFP_ATOMIC);
 	if (!mem)
 		panic("ebus_alloc: out of memory");
-	memset((char *)mem, 0, size);
 	return mem;
 }
 

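The entry.S diff below introduces the pattern this merge uses everywhere for sun4u/sun4v dual support: the original instruction gets a local label 661, and an (address, replacement-instruction) pair is emitted into the .sun4v_1insn_patch section. Early in boot, once the kernel knows it is running under a hypervisor, it walks that section and overwrites each tagged instruction in place. A hedged sketch of such a patcher (the entry layout is inferred from the .word directives; names are assumptions):

struct sun4v_1insn_patch_entry {
	unsigned int	addr;	/* address of the 661: instruction */
	unsigned int	insn;	/* sun4v replacement instruction */
};

extern struct sun4v_1insn_patch_entry __sun4v_1insn_patch,
	__sun4v_1insn_patch_end;

static void sun4v_patch_1insn(void)
{
	struct sun4v_1insn_patch_entry *p;

	for (p = &__sun4v_1insn_patch; p < &__sun4v_1insn_patch_end; p++) {
		unsigned int *insn = (unsigned int *)(unsigned long)p->addr;

		*insn = p->insn;
		/* the patched line must be flushed from the I-cache */
		__asm__ __volatile__("flush %0" : : "r" (insn));
	}
}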
+ 232 - 99
arch/sparc64/kernel/entry.S

@@ -50,7 +50,8 @@ do_fpdis:
 	add		%g0, %g0, %g0
 	ba,a,pt		%xcc, rtrap_clr_l6
 
-1:	ldub		[%g6 + TI_FPSAVED], %g5
+1:	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	ldub		[%g6 + TI_FPSAVED], %g5
 	wr		%g0, FPRS_FEF, %fprs
 	andcc		%g5, FPRS_FEF, %g0
 	be,a,pt		%icc, 1f
@@ -96,10 +97,22 @@ do_fpdis:
 	add		%g6, TI_FPREGS + 0x80, %g1
 	faddd		%f0, %f2, %f4
 	fmuld		%f0, %f2, %f6
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	add		%g6, TI_FPREGS + 0xc0, %g2
 	faddd		%f0, %f2, %f8
@@ -125,11 +138,23 @@ do_fpdis:
 	 fzero		%f32
 	mov		SECONDARY_CONTEXT, %g3
 	fzero		%f34
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	add		%g6, TI_FPREGS, %g1
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	add		%g6, TI_FPREGS + 0x40, %g2
 	faddd		%f32, %f34, %f36
@@ -154,10 +179,22 @@ do_fpdis:
 	 nop
 3:	mov		SECONDARY_CONTEXT, %g3
 	add		%g6, TI_FPREGS, %g1
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	mov		0x40, %g2
 	membar		#Sync
@@ -168,7 +205,13 @@ do_fpdis:
 	ldda		[%g1 + %g2] ASI_BLK_S, %f48
 	membar		#Sync
 fpdis_exit:
-	stxa		%g5, [%g3] ASI_DMMU
+
+661:	stxa		%g5, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g5, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 fpdis_exit2:
 	wr		%g7, 0, %gsr
@@ -189,6 +232,7 @@ fp_other_bounce:
 	.globl		do_fpother_check_fitos
 	.align		32
 do_fpother_check_fitos:
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
 	sethi		%hi(fp_other_bounce - 4), %g7
 	or		%g7, %lo(fp_other_bounce - 4), %g7
 
@@ -312,6 +356,7 @@ fitos_emul_fini:
 	.globl		do_fptrap
 	.align		32
 do_fptrap:
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
 	stx		%fsr, [%g6 + TI_XFSR]
 do_fptrap_after_fsr:
 	ldub		[%g6 + TI_FPSAVED], %g3
@@ -321,10 +366,22 @@ do_fptrap_after_fsr:
 	rd		%gsr, %g3
 	stx		%g3, [%g6 + TI_GSR]
 	mov		SECONDARY_CONTEXT, %g3
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	add		%g6, TI_FPREGS, %g2
 	andcc		%g1, FPRS_DL, %g0
@@ -339,7 +396,13 @@ do_fptrap_after_fsr:
 	stda		%f48, [%g2 + %g3] ASI_BLK_S
 5:	mov		SECONDARY_CONTEXT, %g1
 	membar		#Sync
-	stxa		%g5, [%g1] ASI_DMMU
+
+661:	stxa		%g5, [%g1] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g5, [%g1] ASI_MMU
+	.previous
+
 	membar		#Sync
 	ba,pt		%xcc, etrap
 	 wr		%g0, 0, %fprs
@@ -353,8 +416,6 @@ do_fptrap_after_fsr:
 	 *
 	 * With this method we can do most of the cross-call tlb/cache
 	 * flushing very quickly.
-	 *
-	 * Current CPU's IRQ worklist table is locked into %g6, don't touch.
 	 */
 	.text
 	.align		32
@@ -378,6 +439,8 @@ do_ivec:
 	sllx		%g2, %g4, %g2
 	sllx		%g4, 2, %g4
 
+	TRAP_LOAD_IRQ_WORK(%g6, %g1)
+
 	lduw		[%g6 + %g4], %g5	/* g5 = irq_work(cpu, pil) */
 	stw		%g5, [%g3 + 0x00]	/* bucket->irq_chain = g5 */
 	stw		%g3, [%g6 + %g4]	/* irq_work(cpu, pil) = bucket */
@@ -399,76 +462,6 @@ do_ivec_xcall:
 1:	jmpl		%g3, %g0
 	 nop
 
-	.globl		save_alternate_globals
-save_alternate_globals: /* %o0 = save_area */
-	rdpr		%pstate, %o5
-	andn		%o5, PSTATE_IE, %o1
-	wrpr		%o1, PSTATE_AG, %pstate
-	stx		%g0, [%o0 + 0x00]
-	stx		%g1, [%o0 + 0x08]
-	stx		%g2, [%o0 + 0x10]
-	stx		%g3, [%o0 + 0x18]
-	stx		%g4, [%o0 + 0x20]
-	stx		%g5, [%o0 + 0x28]
-	stx		%g6, [%o0 + 0x30]
-	stx		%g7, [%o0 + 0x38]
-	wrpr		%o1, PSTATE_IG, %pstate
-	stx		%g0, [%o0 + 0x40]
-	stx		%g1, [%o0 + 0x48]
-	stx		%g2, [%o0 + 0x50]
-	stx		%g3, [%o0 + 0x58]
-	stx		%g4, [%o0 + 0x60]
-	stx		%g5, [%o0 + 0x68]
-	stx		%g6, [%o0 + 0x70]
-	stx		%g7, [%o0 + 0x78]
-	wrpr		%o1, PSTATE_MG, %pstate
-	stx		%g0, [%o0 + 0x80]
-	stx		%g1, [%o0 + 0x88]
-	stx		%g2, [%o0 + 0x90]
-	stx		%g3, [%o0 + 0x98]
-	stx		%g4, [%o0 + 0xa0]
-	stx		%g5, [%o0 + 0xa8]
-	stx		%g6, [%o0 + 0xb0]
-	stx		%g7, [%o0 + 0xb8]
-	wrpr		%o5, 0x0, %pstate
-	retl
-	 nop
-
-	.globl		restore_alternate_globals
-restore_alternate_globals: /* %o0 = save_area */
-	rdpr		%pstate, %o5
-	andn		%o5, PSTATE_IE, %o1
-	wrpr		%o1, PSTATE_AG, %pstate
-	ldx		[%o0 + 0x00], %g0
-	ldx		[%o0 + 0x08], %g1
-	ldx		[%o0 + 0x10], %g2
-	ldx		[%o0 + 0x18], %g3
-	ldx		[%o0 + 0x20], %g4
-	ldx		[%o0 + 0x28], %g5
-	ldx		[%o0 + 0x30], %g6
-	ldx		[%o0 + 0x38], %g7
-	wrpr		%o1, PSTATE_IG, %pstate
-	ldx		[%o0 + 0x40], %g0
-	ldx		[%o0 + 0x48], %g1
-	ldx		[%o0 + 0x50], %g2
-	ldx		[%o0 + 0x58], %g3
-	ldx		[%o0 + 0x60], %g4
-	ldx		[%o0 + 0x68], %g5
-	ldx		[%o0 + 0x70], %g6
-	ldx		[%o0 + 0x78], %g7
-	wrpr		%o1, PSTATE_MG, %pstate
-	ldx		[%o0 + 0x80], %g0
-	ldx		[%o0 + 0x88], %g1
-	ldx		[%o0 + 0x90], %g2
-	ldx		[%o0 + 0x98], %g3
-	ldx		[%o0 + 0xa0], %g4
-	ldx		[%o0 + 0xa8], %g5
-	ldx		[%o0 + 0xb0], %g6
-	ldx		[%o0 + 0xb8], %g7
-	wrpr		%o5, 0x0, %pstate
-	retl
-	 nop
-
 	.globl		getcc, setcc
 getcc:
 	ldx		[%o0 + PT_V9_TSTATE], %o1
@@ -488,9 +481,24 @@ setcc:
 	retl
 	 stx		%o1, [%o0 + PT_V9_TSTATE]
 
-	.globl		utrap, utrap_ill
-utrap:	brz,pn		%g1, etrap
+	.globl		utrap_trap
+utrap_trap:		/* %g3=handler,%g4=level */
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	ldx		[%g6 + TI_UTRAPS], %g1
+	brnz,pt		%g1, invoke_utrap
 	 nop
+
+	ba,pt		%xcc, etrap
+	 rd		%pc, %g7
+	mov		%l4, %o1
+        call		bad_trap
+	 add		%sp, PTREGS_OFF, %o0
+	ba,pt		%xcc, rtrap
+	 clr		%l6
+
+invoke_utrap:
+	sllx		%g3, 3, %g3
+	ldx		[%g1 + %g3], %g1
 	save		%sp, -128, %sp
 	rdpr		%tstate, %l6
 	rdpr		%cwp, %l7
@@ -500,17 +508,6 @@ utrap:	brz,pn		%g1, etrap
 	rdpr		%tnpc, %l7
 	wrpr		%g1, 0, %tnpc
 	done
-utrap_ill:
-        call		bad_trap
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 clr		%l6
-
-	/* XXX Here is stuff we still need to write... -DaveM XXX */
-	.globl		netbsd_syscall
-netbsd_syscall:
-	retl
-	 nop
 
 	/* We need to carefully read the error status, ACK
 	 * the errors, prevent recursive traps, and pass the
@@ -1001,7 +998,7 @@ dcpe_icpe_tl1_common:
 	 * %g3:		scratch
 	 * %g4:		AFSR
 	 * %g5:		AFAR
-	 * %g6:		current thread ptr
+	 * %g6:		unused, will have current thread ptr after etrap
 	 * %g7:		scratch
 	 */
 __cheetah_log_error:
@@ -1539,13 +1536,14 @@ ret_from_syscall:
 
 1:		b,pt		%xcc, ret_sys_call
 		 ldx		[%sp + PTREGS_OFF + PT_V9_I0], %o0
-sparc_exit:	wrpr		%g0, (PSTATE_RMO | PSTATE_PEF | PSTATE_PRIV), %pstate
+sparc_exit:	rdpr		%pstate, %g2
+		wrpr		%g2, PSTATE_IE, %pstate
 		rdpr		%otherwin, %g1
 		rdpr		%cansave, %g3
 		add		%g3, %g1, %g3
 		wrpr		%g3, 0x0, %cansave
 		wrpr		%g0, 0x0, %otherwin
-		wrpr		%g0, (PSTATE_RMO | PSTATE_PEF | PSTATE_PRIV | PSTATE_IE), %pstate
+		wrpr		%g2, 0x0, %pstate
 		ba,pt		%xcc, sys_exit
 		 stb		%g0, [%g6 + TI_WSAVED]
 
@@ -1690,3 +1688,138 @@ __flushw_user:
 	 restore	%g0, %g0, %g0
 2:	retl
 	 nop
+
+#ifdef CONFIG_SMP
+	.globl		hard_smp_processor_id
+hard_smp_processor_id:
+#endif
+	.globl		real_hard_smp_processor_id
+real_hard_smp_processor_id:
+	__GET_CPUID(%o0)
+	retl
+	 nop
+
+	/* %o0: devhandle
+	 * %o1:	devino
+	 *
+	 * returns %o0: sysino
+	 */
+	.globl	sun4v_devino_to_sysino
+sun4v_devino_to_sysino:
+	mov	HV_FAST_INTR_DEVINO2SYSINO, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 *
+	 * returns %o0: intr_enabled (HV_INTR_{DISABLED,ENABLED})
+	 */
+	.globl	sun4v_intr_getenabled
+sun4v_intr_getenabled:
+	mov	HV_FAST_INTR_GETENABLED, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 * %o1: intr_enabled (HV_INTR_{DISABLED,ENABLED})
+	 */
+	.globl	sun4v_intr_setenabled
+sun4v_intr_setenabled:
+	mov	HV_FAST_INTR_SETENABLED, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0: sysino
+	 *
+	 * returns %o0: intr_state (HV_INTR_STATE_*)
+	 */
+	.globl	sun4v_intr_getstate
+sun4v_intr_getstate:
+	mov	HV_FAST_INTR_GETSTATE, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 * %o1: intr_state (HV_INTR_STATE_*)
+	 */
+	.globl	sun4v_intr_setstate
+sun4v_intr_setstate:
+	mov	HV_FAST_INTR_SETSTATE, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0: sysino
+	 *
+	 * returns %o0: cpuid
+	 */
+	.globl	sun4v_intr_gettarget
+sun4v_intr_gettarget:
+	mov	HV_FAST_INTR_GETTARGET, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 * %o1: cpuid
+	 */
+	.globl	sun4v_intr_settarget
+sun4v_intr_settarget:
+	mov	HV_FAST_INTR_SETTARGET, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0:	type
+	 * %o1:	queue paddr
+	 * %o2:	num queue entries
+	 *
+	 * returns %o0:	status
+	 */
+	.globl	sun4v_cpu_qconf
+sun4v_cpu_qconf:
+	mov	HV_FAST_CPU_QCONF, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* returns %o0:	status
+	 */
+	.globl	sun4v_cpu_yield
+sun4v_cpu_yield:
+	mov	HV_FAST_CPU_YIELD, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0:	num cpus in cpu list
+	 * %o1:	cpu list paddr
+	 * %o2:	mondo block paddr
+	 *
+	 * returns %o0: status
+	 */
+	.globl	sun4v_cpu_mondo_send
+sun4v_cpu_mondo_send:
+	mov	HV_FAST_CPU_MONDO_SEND, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0:	CPU ID
+	 *
+	 * returns %o0:	-status if status non-zero, else
+	 *         %o0:	cpu state as HV_CPU_STATE_*
+	 */
+	.globl	sun4v_cpu_state
+sun4v_cpu_state:
+	mov	HV_FAST_CPU_STATE, %o5
+	ta	HV_FAST_TRAP
+	brnz,pn	%o0, 1f
+	 sub	%g0, %o0, %o0
+	mov	%o1, %o0
+1:	retl
+	 nop

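All of the sun4v_* stubs above share the hypervisor fast-trap convention: the API function number is loaded into %o5, arguments stay in the %o registers, `ta HV_FAST_TRAP` enters the hypervisor, and the status comes back in %o0 with any result value in %o1 (which is why several stubs end with `mov %o1, %o0`). From C they read as plain functions; a sketch of typical prototypes and use (error handling abbreviated; the HV_* constants come from the hypervisor API definitions):

extern unsigned long sun4v_devino_to_sysino(unsigned long devhandle,
					    unsigned long devino);
extern unsigned long sun4v_intr_settarget(unsigned long sysino,
					  unsigned long cpuid);
extern unsigned long sun4v_intr_setenabled(unsigned long sysino,
					   unsigned long intr_enabled);

static int route_vdev_irq(u32 devhandle, unsigned int devino, int cpu)
{
	unsigned long sysino = sun4v_devino_to_sysino(devhandle, devino);

	if (sun4v_intr_settarget(sysino, cpu) != HV_EOK)
		return -1;
	return sun4v_intr_setenabled(sysino, HV_INTR_ENABLED) == HV_EOK ? 0 : -1;
}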
+ 72 - 98
arch/sparc64/kernel/etrap.S

@@ -31,6 +31,7 @@
 		.globl	etrap, etrap_irq, etraptl1
 etrap:		rdpr	%pil, %g2
 etrap_irq:
+		TRAP_LOAD_THREAD_REG(%g6, %g1)
 		rdpr	%tstate, %g1
 		sllx	%g2, 20, %g3
 		andcc	%g1, TSTATE_PRIV, %g0
@@ -54,7 +55,31 @@ etrap_irq:
 		rd	%y, %g3
 		stx	%g1, [%g2 + STACKFRAME_SZ + PT_V9_TNPC]
 		st	%g3, [%g2 + STACKFRAME_SZ + PT_V9_Y]
-		save	%g2, -STACK_BIAS, %sp	! Ordering here is critical
+
+		rdpr	%cansave, %g1
+		brnz,pt %g1, etrap_save
+		 nop
+
+		rdpr	%cwp, %g1
+		add	%g1, 2, %g1
+		wrpr	%g1, %cwp
+		be,pt	%xcc, etrap_user_spill
+		 mov	ASI_AIUP, %g3
+
+		rdpr	%otherwin, %g3
+		brz	%g3, etrap_kernel_spill
+		 mov	ASI_AIUS, %g3
+
+etrap_user_spill:
+
+		wr	%g3, 0x0, %asi
+		ldx	[%g6 + TI_FLAGS], %g3
+		and	%g3, _TIF_32BIT, %g3
+		brnz,pt	%g3, etrap_user_spill_32bit
+		 nop
+		ba,a,pt	%xcc, etrap_user_spill_64bit
+
+etrap_save:	save	%g2, -STACK_BIAS, %sp
 		mov	%g6, %l6
 
 		bne,pn	%xcc, 3f
@@ -70,42 +95,56 @@ etrap_irq:
 		wrpr	%g2, 0, %wstate
 		sethi	%hi(sparc64_kern_pri_context), %g2
 		ldx	[%g2 + %lo(sparc64_kern_pri_context)], %g3
-		stxa	%g3, [%l4] ASI_DMMU
-		flush	%l6
-		wr	%g0, ASI_AIUS, %asi
-2:		wrpr	%g0, 0x0, %tl
-		mov	%g4, %l4
+
+661:		stxa	%g3, [%l4] ASI_DMMU
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		stxa	%g3, [%l4] ASI_MMU
+		.previous
+
+		sethi	%hi(KERNBASE), %l4
+		flush	%l4
+		mov	ASI_AIUS, %l7
+2:		mov	%g4, %l4
 		mov	%g5, %l5
+		add	%g7, 4, %l2
+
+		/* Go to trap time globals so we can save them.  */
+661:		wrpr	%g0, ETRAP_PSTATE1, %pstate
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		SET_GL(0)
+		.previous
 
-		mov	%g7, %l2
-		wrpr	%g0, ETRAP_PSTATE1, %pstate
 		stx	%g1, [%sp + PTREGS_OFF + PT_V9_G1]
 		stx	%g2, [%sp + PTREGS_OFF + PT_V9_G2]
+		sllx	%l7, 24, %l7
 		stx	%g3, [%sp + PTREGS_OFF + PT_V9_G3]
+		rdpr	%cwp, %l0
 		stx	%g4, [%sp + PTREGS_OFF + PT_V9_G4]
 		stx	%g5, [%sp + PTREGS_OFF + PT_V9_G5]
 		stx	%g6, [%sp + PTREGS_OFF + PT_V9_G6]
-
 		stx	%g7, [%sp + PTREGS_OFF + PT_V9_G7]
+		or	%l7, %l0, %l7
+		sethi	%hi(TSTATE_RMO | TSTATE_PEF), %l0
+		or	%l7, %l0, %l7
+		wrpr	%l2, %tnpc
+		wrpr	%l7, (TSTATE_PRIV | TSTATE_IE), %tstate
 		stx	%i0, [%sp + PTREGS_OFF + PT_V9_I0]
 		stx	%i1, [%sp + PTREGS_OFF + PT_V9_I1]
 		stx	%i2, [%sp + PTREGS_OFF + PT_V9_I2]
 		stx	%i3, [%sp + PTREGS_OFF + PT_V9_I3]
 		stx	%i4, [%sp + PTREGS_OFF + PT_V9_I4]
 		stx	%i5, [%sp + PTREGS_OFF + PT_V9_I5]
-
 		stx	%i6, [%sp + PTREGS_OFF + PT_V9_I6]
-		stx	%i7, [%sp + PTREGS_OFF + PT_V9_I7]
-		wrpr	%g0, ETRAP_PSTATE2, %pstate
 		mov	%l6, %g6
-#ifdef CONFIG_SMP
-		mov	TSB_REG, %g3
-		ldxa	[%g3] ASI_IMMU, %g5
-#endif
-		jmpl	%l2 + 0x4, %g0
-		 ldx	[%g6 + TI_TASK], %g4
+		stx	%i7, [%sp + PTREGS_OFF + PT_V9_I7]
+		LOAD_PER_CPU_BASE(%g5, %g6, %g4, %g3, %l1)
+		ldx	[%g6 + TI_TASK], %g4
+		done
 
-3:		ldub	[%l6 + TI_FPDEPTH], %l5
+3:		mov	ASI_P, %l7
+		ldub	[%l6 + TI_FPDEPTH], %l5
 		add	%l6, TI_FPSAVED + 1, %l4
 		srl	%l5, 1, %l3
 		add	%l5, 2, %l5
@@ -125,6 +164,7 @@ etraptl1:	/* Save tstate/tpc/tnpc of TL 1-->4 and the tl register itself.
 		 *	0x58	TL4's TT
 		 *	0x60	TL
 		 */
+		TRAP_LOAD_THREAD_REG(%g6, %g1)
 		sub	%sp, ((4 * 8) * 4) + 8, %g2
 		rdpr	%tl, %g1
 
@@ -148,6 +188,11 @@ etraptl1:	/* Save tstate/tpc/tnpc of TL 1-->4 and the tl register itself.
 		rdpr	%tt, %g3
 		stx	%g3, [%g2 + STACK_BIAS + 0x38]
 
+		sethi	%hi(is_sun4v), %g3
+		lduw	[%g3 + %lo(is_sun4v)], %g3
+		brnz,pn	%g3, finish_tl1_capture
+		 nop
+
 		wrpr	%g0, 3, %tl
 		rdpr	%tstate, %g3
 		stx	%g3, [%g2 + STACK_BIAS + 0x40]
@@ -168,91 +213,20 @@ etraptl1:	/* Save tstate/tpc/tnpc of TL 1-->4 and the tl register itself.
 		rdpr	%tt, %g3
 		stx	%g3, [%g2 + STACK_BIAS + 0x78]
 
-		wrpr	%g1, %tl
 		stx	%g1, [%g2 + STACK_BIAS + 0x80]
 
+finish_tl1_capture:
+		wrpr	%g0, 1, %tl
+661:		nop
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		SET_GL(1)
+		.previous
+
 		rdpr	%tstate, %g1
 		sub	%g2, STACKFRAME_SZ + TRACEREG_SZ - STACK_BIAS, %g2
 		ba,pt	%xcc, 1b
 		 andcc	%g1, TSTATE_PRIV, %g0
 
-		.align	64
-		.globl	scetrap
-scetrap:	rdpr	%pil, %g2
-		rdpr	%tstate, %g1
-		sllx	%g2, 20, %g3
-		andcc	%g1, TSTATE_PRIV, %g0
-		or	%g1, %g3, %g1
-		bne,pn	%xcc, 1f
-		 sub	%sp, (STACKFRAME_SZ+TRACEREG_SZ-STACK_BIAS), %g2
-		wrpr	%g0, 7, %cleanwin
-
-		sllx	%g1, 51, %g3
-		sethi	%hi(TASK_REGOFF), %g2
-		or	%g2, %lo(TASK_REGOFF), %g2
-		brlz,pn	%g3, 1f
-		 add	%g6, %g2, %g2
-		wr	%g0, 0, %fprs
-1:		rdpr	%tpc, %g3
-		stx	%g1, [%g2 + STACKFRAME_SZ + PT_V9_TSTATE]
-
-		rdpr	%tnpc, %g1
-		stx	%g3, [%g2 + STACKFRAME_SZ + PT_V9_TPC]
-		stx	%g1, [%g2 + STACKFRAME_SZ + PT_V9_TNPC]
-		save	%g2, -STACK_BIAS, %sp	! Ordering here is critical
-		mov	%g6, %l6
-		bne,pn	%xcc, 2f
-		 mov	ASI_P, %l7
-		rdpr	%canrestore, %g3
-
-		rdpr	%wstate, %g2
-		wrpr	%g0, 0, %canrestore
-		sll	%g2, 3, %g2
-		mov	PRIMARY_CONTEXT, %l4
-		wrpr	%g3, 0, %otherwin
-		wrpr	%g2, 0, %wstate
-		sethi	%hi(sparc64_kern_pri_context), %g2
-		ldx	[%g2 + %lo(sparc64_kern_pri_context)], %g3
-		stxa	%g3, [%l4] ASI_DMMU
-		flush	%l6
-
-		mov	ASI_AIUS, %l7
-2:		mov	%g4, %l4
-		mov	%g5, %l5
-		add	%g7, 0x4, %l2
-		wrpr	%g0, ETRAP_PSTATE1, %pstate
-		stx	%g1, [%sp + PTREGS_OFF + PT_V9_G1]
-		stx	%g2, [%sp + PTREGS_OFF + PT_V9_G2]
-		sllx	%l7, 24, %l7
-
-		stx	%g3, [%sp + PTREGS_OFF + PT_V9_G3]
-		rdpr	%cwp, %l0
-		stx	%g4, [%sp + PTREGS_OFF + PT_V9_G4]
-		stx	%g5, [%sp + PTREGS_OFF + PT_V9_G5]
-		stx	%g6, [%sp + PTREGS_OFF + PT_V9_G6]
-		stx	%g7, [%sp + PTREGS_OFF + PT_V9_G7]
-		or	%l7, %l0, %l7
-		sethi	%hi(TSTATE_RMO | TSTATE_PEF), %l0
-
-		or	%l7, %l0, %l7
-		wrpr	%l2, %tnpc
-		wrpr	%l7, (TSTATE_PRIV | TSTATE_IE), %tstate
-		stx	%i0, [%sp + PTREGS_OFF + PT_V9_I0]
-		stx	%i1, [%sp + PTREGS_OFF + PT_V9_I1]
-		stx	%i2, [%sp + PTREGS_OFF + PT_V9_I2]
-		stx	%i3, [%sp + PTREGS_OFF + PT_V9_I3]
-		stx	%i4, [%sp + PTREGS_OFF + PT_V9_I4]
-
-		stx	%i5, [%sp + PTREGS_OFF + PT_V9_I5]
-		stx	%i6, [%sp + PTREGS_OFF + PT_V9_I6]
-		mov	%l6, %g6
-		stx	%i7, [%sp + PTREGS_OFF + PT_V9_I7]
-#ifdef CONFIG_SMP
-		mov	TSB_REG, %g3
-		ldxa	[%g3] ASI_IMMU, %g5
-#endif
-		ldx	[%g6 + TI_TASK], %g4
-		done
-
 #undef TASK_REGOFF
 #undef TASK_REGOFF
 #undef ETRAP_PSTATE1

+ 172 - 82
arch/sparc64/kernel/head.S

@@ -26,6 +26,7 @@
 #include <asm/head.h>
 #include <asm/ttable.h>
 #include <asm/mmu.h>
+#include <asm/cpudata.h>
 	
 /* This section from from _start to sparc64_boot_end should fit into
  * 0x0000000000404000 to 0x0000000000408000.
@@ -94,12 +95,17 @@ sparc64_boot:
 	wrpr	%g1, 0x0, %pstate
 	ba,a,pt	%xcc, 1f
 
-	.globl	prom_finddev_name, prom_chosen_path
-	.globl	prom_getprop_name, prom_mmu_name
-	.globl	prom_callmethod_name, prom_translate_name
+	.globl	prom_finddev_name, prom_chosen_path, prom_root_node
+	.globl	prom_getprop_name, prom_mmu_name, prom_peer_name
+	.globl	prom_callmethod_name, prom_translate_name, prom_root_compatible
 	.globl	prom_map_name, prom_unmap_name, prom_mmu_ihandle_cache
 	.globl	prom_boot_mapped_pc, prom_boot_mapping_mode
 	.globl	prom_boot_mapping_phys_high, prom_boot_mapping_phys_low
+	.globl	is_sun4v
+prom_peer_name:
+	.asciz	"peer"
+prom_compatible_name:
+	.asciz	"compatible"
 prom_finddev_name:
 	.asciz	"finddevice"
 prom_chosen_path:
@@ -116,7 +122,13 @@ prom_map_name:
 	.asciz	"map"
 prom_unmap_name:
 	.asciz	"unmap"
+prom_sun4v_name:
+	.asciz	"sun4v"
 	.align	4
+prom_root_compatible:
+	.skip	64
+prom_root_node:
+	.word	0
 prom_mmu_ihandle_cache:
 	.word	0
 prom_boot_mapped_pc:
@@ -128,8 +140,54 @@ prom_boot_mapping_phys_high:
 	.xword	0
 prom_boot_mapping_phys_low:
 	.xword	0
+is_sun4v:
+	.word	0
 1:
 	rd	%pc, %l0
+
+	mov	(1b - prom_peer_name), %l1
+	sub	%l0, %l1, %l1
+	mov	0, %l2
+
+	/* prom_root_node = prom_peer(0) */
+	stx	%l1, [%sp + 2047 + 128 + 0x00]	! service, "peer"
+	mov	1, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x08]	! num_args, 1
+	stx	%l3, [%sp + 2047 + 128 + 0x10]	! num_rets, 1
+	stx	%l2, [%sp + 2047 + 128 + 0x18]	! arg1, 0
+	stx	%g0, [%sp + 2047 + 128 + 0x20]	! ret1
+	call	%l7
+	 add	%sp, (2047 + 128), %o0		! argument array
+
+	ldx	[%sp + 2047 + 128 + 0x20], %l4	! prom root node
+	mov	(1b - prom_root_node), %l1
+	sub	%l0, %l1, %l1
+	stw	%l4, [%l1]
+
+	mov	(1b - prom_getprop_name), %l1
+	mov	(1b - prom_compatible_name), %l2
+	mov	(1b - prom_root_compatible), %l5
+	sub	%l0, %l1, %l1
+	sub	%l0, %l2, %l2
+	sub	%l0, %l5, %l5
+
+	/* prom_getproperty(prom_root_node, "compatible",
+	 *                  &prom_root_compatible, 64)
+	 */
+	stx	%l1, [%sp + 2047 + 128 + 0x00]	! service, "getprop"
+	mov	4, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x08]	! num_args, 4
+	mov	1, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x10]	! num_rets, 1
+	stx	%l4, [%sp + 2047 + 128 + 0x18]	! arg1, prom_root_node
+	stx	%l2, [%sp + 2047 + 128 + 0x20]	! arg2, "compatible"
+	stx	%l5, [%sp + 2047 + 128 + 0x28]	! arg3, &prom_root_compatible
+	mov	64, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x30]	! arg4, size
+	stx	%g0, [%sp + 2047 + 128 + 0x38]	! ret1
+	call	%l7
+	 add	%sp, (2047 + 128), %o0		! argument array
+
 	mov	(1b - prom_finddev_name), %l1
 	mov	(1b - prom_chosen_path), %l2
 	mov	(1b - prom_boot_mapped_pc), %l3
@@ -238,6 +296,27 @@ prom_boot_mapping_phys_low:
 	add	%sp, (192 + 128), %sp
 
 sparc64_boot_after_remap:
+	sethi	%hi(prom_root_compatible), %g1
+	or	%g1, %lo(prom_root_compatible), %g1
+	sethi	%hi(prom_sun4v_name), %g7
+	or	%g7, %lo(prom_sun4v_name), %g7
+	mov	5, %g3
+1:	ldub	[%g7], %g2
+	ldub	[%g1], %g4
+	cmp	%g2, %g4
+	bne,pn	%icc, 2f
+	 add	%g7, 1, %g7
+	subcc	%g3, 1, %g3
+	bne,pt	%xcc, 1b
+	 add	%g1, 1, %g1
+
+	sethi	%hi(is_sun4v), %g1
+	or	%g1, %lo(is_sun4v), %g1
+	mov	1, %g7
+	stw	%g7, [%g1]
+
+2:
+	BRANCH_IF_SUN4V(g1, jump_to_sun4u_init)
 	BRANCH_IF_CHEETAH_BASE(g1,g7,cheetah_boot)
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,cheetah_plus_boot)
 	ba,pt	%xcc, spitfire_boot
@@ -301,20 +380,58 @@ jump_to_sun4u_init:
 	 nop
 
 sun4u_init:
+	BRANCH_IF_SUN4V(g1, sun4v_init)
+
 	/* Set ctx 0 */
-	mov	PRIMARY_CONTEXT, %g7
-	stxa	%g0, [%g7] ASI_DMMU
-	membar	#Sync
+	mov		PRIMARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_DMMU
+	membar		#Sync
 
-	mov	SECONDARY_CONTEXT, %g7
-	stxa	%g0, [%g7] ASI_DMMU
+	mov		SECONDARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_DMMU
 	membar	#Sync
 
+	ba,pt		%xcc, sun4u_continue
+	 nop
+
+sun4v_init:
+	/* Set ctx 0 */
+	mov		PRIMARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_MMU
+	membar		#Sync
+
+	mov		SECONDARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_MMU
+	membar		#Sync
+	ba,pt		%xcc, niagara_tlb_fixup
+	 nop
+
+sun4u_continue:
+	BRANCH_IF_ANY_CHEETAH(g1, g7, cheetah_tlb_fixup)
 
 	ba,pt	%xcc, spitfire_tlb_fixup
 	 nop
 
 
+niagara_tlb_fixup:
+	mov	3, %g2		/* Set TLB type to hypervisor. */
+	sethi	%hi(tlb_type), %g1
+	stw	%g2, [%g1 + %lo(tlb_type)]
+
+	/* Patch copy/clear ops.  */
+	call	niagara_patch_copyops
+	 nop
+	call	niagara_patch_bzero
+	 nop
+	call	niagara_patch_pageops
+	 nop
+
+	/* Patch TLB/cache ops.  */
+	call	hypervisor_patch_cachetlbops
+	 nop
+
+	ba,pt	%xcc, tlb_fixup_done
+	 nop
+
 cheetah_tlb_fixup:
 cheetah_tlb_fixup:
 	mov	2, %g2		/* Set TLB type to cheetah+. */
 	mov	2, %g2		/* Set TLB type to cheetah+. */
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,1f)
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,1f)
@@ -411,85 +528,55 @@ setup_trap_table:
 	wrpr	%g0, 15, %pil
 	wrpr	%g0, 15, %pil
 
 
 	/* Make the firmware call to jump over to the Linux trap table.  */
 	/* Make the firmware call to jump over to the Linux trap table.  */
-	call	prom_set_trap_table
-	 sethi	%hi(sparc64_ttable_tl0), %o0
+	sethi	%hi(is_sun4v), %o0
+	lduw	[%o0 + %lo(is_sun4v)], %o0
+	brz,pt	%o0, 1f
+	 nop
 
 
-	/* Start using proper page size encodings in ctx register.  */
-	sethi	%hi(sparc64_kern_pri_context), %g3
-	ldx	[%g3 + %lo(sparc64_kern_pri_context)], %g2
-	mov	PRIMARY_CONTEXT, %g1
-	stxa	%g2, [%g1] ASI_DMMU
-	membar	#Sync
+	TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
+	add	%g2, TRAP_PER_CPU_FAULT_INFO, %g2
+	stxa	%g2, [%g0] ASI_SCRATCHPAD
 
 
-	/* The Linux trap handlers expect various trap global registers
-	 * to be setup with some fixed values.  So here we set these
-	 * up very carefully.  These globals are:
-	 *
-	 * Alternate Globals (PSTATE_AG):
-	 *
-	 * %g6			--> current_thread_info()
-	 *
-	 * MMU Globals (PSTATE_MG):
-	 *
-	 * %g1			--> TLB_SFSR
-	 * %g2			--> ((_PAGE_VALID | _PAGE_SZ4MB |
-	 *			      _PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-	 *			     ^ 0xfffff80000000000)
-	 * (this %g2 value is used for computing the PAGE_OFFSET kernel
-	 *  TLB entries quickly, the virtual address of the fault XOR'd
-	 *  with this %g2 value is the PTE to load into the TLB)
-	 * %g3			--> VPTE_BASE_CHEETAH or VPTE_BASE_SPITFIRE
+	/* Compute physical address:
 	 *
 	 *
-	 * Interrupt Globals (PSTATE_IG, setup by init_irqwork_curcpu()):
-	 *
-	 * %g6			--> __irq_work[smp_processor_id()]
+	 * paddr = kern_base + (mmfsa_vaddr - KERNBASE)
 	 */
 	 */
+	sethi	%hi(KERNBASE), %g3
+	sub	%g2, %g3, %g2
+	sethi	%hi(kern_base), %g3
+	ldx	[%g3 + %lo(kern_base)], %g3
+	add	%g2, %g3, %o1
 
 
-	rdpr	%pstate, %o1
-	mov	%g6, %o2
-	wrpr	%o1, PSTATE_AG, %pstate
-	mov	%o2, %g6
-
-#define KERN_HIGHBITS		((_PAGE_VALID|_PAGE_SZ4MB)^0xfffff80000000000)
-#define KERN_LOWBITS		(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-	wrpr	%o1, PSTATE_MG, %pstate
-	mov	TSB_REG, %g1
-	stxa	%g0, [%g1] ASI_DMMU
-	membar	#Sync
-	stxa	%g0, [%g1] ASI_IMMU
-	membar	#Sync
-	mov	TLB_SFSR, %g1
-	sethi	%uhi(KERN_HIGHBITS), %g2
-	or	%g2, %ulo(KERN_HIGHBITS), %g2
-	sllx	%g2, 32, %g2
-	or	%g2, KERN_LOWBITS, %g2
-
-	BRANCH_IF_ANY_CHEETAH(g3,g7,8f)
-	ba,pt	%xcc, 9f
+	call	prom_set_trap_table_sun4v
+	 sethi	%hi(sparc64_ttable_tl0), %o0
+
+	ba,pt	%xcc, 2f
 	 nop
 	 nop
 
 
-8:
-	sethi		%uhi(VPTE_BASE_CHEETAH), %g3
-	or		%g3, %ulo(VPTE_BASE_CHEETAH), %g3
-	ba,pt		%xcc, 2f
-	 sllx		%g3, 32, %g3
+1:	call	prom_set_trap_table
+	 sethi	%hi(sparc64_ttable_tl0), %o0
 
 
-9:
-	sethi		%uhi(VPTE_BASE_SPITFIRE), %g3
-	or		%g3, %ulo(VPTE_BASE_SPITFIRE), %g3
-	sllx		%g3, 32, %g3
+	/* Start using proper page size encodings in ctx register.  */
+2:	sethi	%hi(sparc64_kern_pri_context), %g3
+	ldx	[%g3 + %lo(sparc64_kern_pri_context)], %g2
 
 
-2:
-	clr	%g7
-#undef KERN_HIGHBITS
-#undef KERN_LOWBITS
+	mov		PRIMARY_CONTEXT, %g1
+
+661:	stxa		%g2, [%g1] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g1] ASI_MMU
+	.previous
+
+	membar	#Sync
 
 
 	/* Kill PROM timer */
 	sethi	%hi(0x80000000), %o2
 	sllx	%o2, 32, %o2
 	wr	%o2, 0, %tick_cmpr
 
 
-	BRANCH_IF_ANY_CHEETAH(o2,o3,1f)
+	BRANCH_IF_SUN4V(o2, 1f)
+	BRANCH_IF_ANY_CHEETAH(o2, o3, 1f)
 
 
 	ba,pt	%xcc, 2f
 	 nop
@@ -502,7 +589,6 @@ setup_trap_table:
 
 
 2:
 	wrpr	%g0, %g0, %wstate
-	wrpr	%o1, 0x0, %pstate
 
 
 	call	init_irqwork_curcpu
 	 nop
@@ -517,7 +603,7 @@ setup_trap_table:
 	 restore
 
 	.globl	setup_tba
-setup_tba:	/* i0 = is_starfire */
+setup_tba:
 	save	%sp, -192, %sp
 
 	/* The boot processor is the only cpu which invokes this
@@ -536,31 +622,35 @@ setup_tba:	/* i0 = is_starfire */
 	 restore
 sparc64_boot_end:
 
 
-#include "systbls.S"
 #include "ktlb.S"
 #include "ktlb.S"
+#include "tsb.S"
 #include "etrap.S"
 #include "etrap.S"
 #include "rtrap.S"
 #include "rtrap.S"
 #include "winfixup.S"
 #include "winfixup.S"
 #include "entry.S"
 #include "entry.S"
+#include "sun4v_tlb_miss.S"
+#include "sun4v_ivec.S"
 
 
 /*
  * The following skip makes sure the trap table in ttable.S is aligned
  * on a 32K boundary as required by the v9 specs for TBA register.
+ *
+ * We align to a 32K boundary, then we have the 32K kernel TSB,
+ * then the 32K aligned trap table.
  */
  */
 1:
 	.skip	0x4000 + _start - 1b
 
-#ifdef CONFIG_SBUS
-/* This is just a hack to fool make depend config.h discovering
-   strategy: As the .S files below need config.h, but
-   make depend does not find it for them, we include config.h
-   in head.S */
-#endif
+	.globl	swapper_tsb
+swapper_tsb:
+	.skip	(32 * 1024)
 
 
 ! 0x0000000000408000
 
 
 #include "ttable.S"
 #include "ttable.S"
 
 
+#include "systbls.S"
+
 	.data
 	.align	8
 	.globl	prom_tba, tlb_type

+ 252 - 87
arch/sparc64/kernel/irq.c

@@ -21,6 +21,7 @@
 #include <linux/delay.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/bootmem.h>
 
 #include <asm/ptrace.h>
 #include <asm/processor.h>
@@ -39,6 +40,7 @@
 #include <asm/cache.h>
 #include <asm/cpudata.h>
 #include <asm/auxio.h>
+#include <asm/head.h>
 
 #ifdef CONFIG_SMP
 static void distribute_irqs(void);
@@ -136,12 +138,48 @@ out_unlock:
 	return 0;
 }
 
+extern unsigned long real_hard_smp_processor_id(void);
+
+static unsigned int sun4u_compute_tid(unsigned long imap, unsigned long cpuid)
+{
+	unsigned int tid;
+
+	if (this_is_starfire) {
+		tid = starfire_translate(imap, cpuid);
+		tid <<= IMAP_TID_SHIFT;
+		tid &= IMAP_TID_UPA;
+	} else {
+		if (tlb_type == cheetah || tlb_type == cheetah_plus) {
+			unsigned long ver;
+
+			__asm__ ("rdpr %%ver, %0" : "=r" (ver));
+			if ((ver >> 32UL) == __JALAPENO_ID ||
+			    (ver >> 32UL) == __SERRANO_ID) {
+				tid = cpuid << IMAP_TID_SHIFT;
+				tid &= IMAP_TID_JBUS;
+			} else {
+				unsigned int a = cpuid & 0x1f;
+				unsigned int n = (cpuid >> 5) & 0x1f;
+
+				tid = ((a << IMAP_AID_SHIFT) |
+				       (n << IMAP_NID_SHIFT));
+				tid &= (IMAP_AID_SAFARI |
+					IMAP_NID_SAFARI);;
+			}
+		} else {
+			tid = cpuid << IMAP_TID_SHIFT;
+			tid &= IMAP_TID_UPA;
+		}
+	}
+
+	return tid;
+}
+
 /* Now these are always passed a true fully specified sun4u INO. */
 void enable_irq(unsigned int irq)
 {
 	struct ino_bucket *bucket = __bucket(irq);
-	unsigned long imap;
-	unsigned long tid;
+	unsigned long imap, cpuid;
 
 	imap = bucket->imap;
 	if (imap == 0UL)
 	if (imap == 0UL)
@@ -149,47 +187,38 @@ void enable_irq(unsigned int irq)
 
 
 	preempt_disable();
 	preempt_disable();
 
 
-	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		unsigned long ver;
-
-		__asm__ ("rdpr %%ver, %0" : "=r" (ver));
-		if ((ver >> 32) == 0x003e0016) {
-			/* We set it to our JBUS ID. */
-			__asm__ __volatile__("ldxa [%%g0] %1, %0"
-					     : "=r" (tid)
-					     : "i" (ASI_JBUS_CONFIG));
-			tid = ((tid & (0x1fUL<<17)) << 9);
-			tid &= IMAP_TID_JBUS;
-		} else {
-			/* We set it to our Safari AID. */
-			__asm__ __volatile__("ldxa [%%g0] %1, %0"
-					     : "=r" (tid)
-					     : "i" (ASI_SAFARI_CONFIG));
-			tid = ((tid & (0x3ffUL<<17)) << 9);
-			tid &= IMAP_AID_SAFARI;
-		}
-	} else if (this_is_starfire == 0) {
-		/* We set it to our UPA MID. */
-		__asm__ __volatile__("ldxa [%%g0] %1, %0"
-				     : "=r" (tid)
-				     : "i" (ASI_UPA_CONFIG));
-		tid = ((tid & UPA_CONFIG_MID) << 9);
-		tid &= IMAP_TID_UPA;
+	/* This gets the physical processor ID, even on uniprocessor,
+	 * so we can always program the interrupt target correctly.
+	 */
+	cpuid = real_hard_smp_processor_id();
+
+	if (tlb_type == hypervisor) {
+		unsigned int ino = __irq_ino(irq);
+		int err;
+
+		err = sun4v_intr_settarget(ino, cpuid);
+		if (err != HV_EOK)
+			printk("sun4v_intr_settarget(%x,%lu): err(%d)\n",
+			       ino, cpuid, err);
+		err = sun4v_intr_setenabled(ino, HV_INTR_ENABLED);
+		if (err != HV_EOK)
+			printk("sun4v_intr_setenabled(%x): err(%d)\n",
+			       ino, err);
 	} else {
 	} else {
-		tid = (starfire_translate(imap, smp_processor_id()) << 26);
-		tid &= IMAP_TID_UPA;
+		unsigned int tid = sun4u_compute_tid(imap, cpuid);
+
+		/* NOTE NOTE NOTE, IGN and INO are read-only, IGN is a product
+		 * of this SYSIO's preconfigured IGN in the SYSIO Control
+		 * Register, the hardware just mirrors that value here.
+		 * However for Graphics and UPA Slave devices the full
+		 * IMAP_INR field can be set by the programmer here.
+		 *
+		 * Things like FFB can now be handled via the new IRQ
+		 * mechanism.
+		 */
+		upa_writel(tid | IMAP_VALID, imap);
 	}
 	}
 
-	 * of this SYSIO's preconfigured IGN in the SYSIO Control
-	 * Register, the hardware just mirrors that value here.
-	 * However for Graphics and UPA Slave devices the full
-	 * IMAP_INR field can be set by the programmer here.
-	 *
-	 * Things like FFB can now be handled via the new IRQ mechanism.
-	 */
-	upa_writel(tid | IMAP_VALID, imap);
-
 	preempt_enable();
 }
 
@@ -201,16 +230,26 @@ void disable_irq(unsigned int irq)
 
 	imap = bucket->imap;
 	if (imap != 0UL) {
-		u32 tmp;
+		if (tlb_type == hypervisor) {
+			unsigned int ino = __irq_ino(irq);
+			int err;
+
+			err = sun4v_intr_setenabled(ino, HV_INTR_DISABLED);
+			if (err != HV_EOK)
+				printk("sun4v_intr_setenabled(%x): "
+				       "err(%d)\n", ino, err);
+		} else {
+			u32 tmp;
 
 
-		/* NOTE: We do not want to futz with the IRQ clear registers
-		 *       and move the state to IDLE, the SCSI code does call
-		 *       disable_irq() to assure atomicity in the queue cmd
-		 *       SCSI adapter driver code.  Thus we'd lose interrupts.
-		 */
-		tmp = upa_readl(imap);
-		tmp &= ~IMAP_VALID;
-		upa_writel(tmp, imap);
+			/* NOTE: We do not want to futz with the IRQ clear registers
+			 *       and move the state to IDLE, the SCSI code does call
+			 *       disable_irq() to assure atomicity in the queue cmd
+			 *       SCSI adapter driver code.  Thus we'd lose interrupts.
+			 */
+			tmp = upa_readl(imap);
+			tmp &= ~IMAP_VALID;
+			upa_writel(tmp, imap);
+		}
 	}
 }
 
@@ -248,6 +287,8 @@ unsigned int build_irq(int pil, int inofixup, unsigned long iclr, unsigned long
 		return __irq(&pil0_dummy_bucket);
 	}
 
+	BUG_ON(tlb_type == hypervisor);
+
 	/* RULE: Both must be specified in all other cases. */
 	if (iclr == 0UL || imap == 0UL) {
 		prom_printf("Invalid build_irq %d %d %016lx %016lx\n",
@@ -275,12 +316,11 @@ unsigned int build_irq(int pil, int inofixup, unsigned long iclr, unsigned long
 		goto out;
 	}
 
-	bucket->irq_info = kmalloc(sizeof(struct irq_desc), GFP_ATOMIC);
+	bucket->irq_info = kzalloc(sizeof(struct irq_desc), GFP_ATOMIC);
 	if (!bucket->irq_info) {
 		prom_printf("IRQ: Error, kmalloc(irq_desc) failed.\n");
 		prom_halt();
 	}
-	memset(bucket->irq_info, 0, sizeof(struct irq_desc));
 
 	/* Ok, looks good, set it up.  Don't touch the irq_chain or
 	 * the pending flag.
@@ -294,6 +334,37 @@ out:
 	return __irq(bucket);
 }
 
+unsigned int sun4v_build_irq(u32 devhandle, unsigned int devino, int pil, unsigned char flags)
+{
+	struct ino_bucket *bucket;
+	unsigned long sysino;
+
+	sysino = sun4v_devino_to_sysino(devhandle, devino);
+
+	bucket = &ivector_table[sysino];
+
+	/* Catch accidental accesses to these things.  IMAP/ICLR handling
+	 * is done by hypervisor calls on sun4v platforms, not by direct
+	 * register accesses.
+	 *
+	 * But we need to make them look unique for the disable_irq() logic
+	 * in free_irq().
+	 */
+	bucket->imap = ~0UL - sysino;
+	bucket->iclr = ~0UL - sysino;
+
+	bucket->pil = pil;
+	bucket->flags = flags;
+
+	bucket->irq_info = kzalloc(sizeof(struct irq_desc), GFP_ATOMIC);
+	if (!bucket->irq_info) {
+		prom_printf("IRQ: Error, kmalloc(irq_desc) failed.\n");
+		prom_halt();
+	}
+
+	return __irq(bucket);
+}
+
 static void atomic_bucket_insert(struct ino_bucket *bucket)
 {
 	unsigned long pstate;
@@ -482,7 +553,6 @@ void free_irq(unsigned int irq, void *dev_id)
 	bucket = __bucket(irq);
 	if (bucket != &pil0_dummy_bucket) {
 		struct irq_desc *desc = bucket->irq_info;
-		unsigned long imap = bucket->imap;
 		int ent, i;
 
 		for (i = 0; i < MAX_IRQ_DESC_ACTION; i++) {
@@ -495,6 +565,8 @@ void free_irq(unsigned int irq, void *dev_id)
 		}
 
 		if (!desc->action_active_mask) {
+			unsigned long imap = bucket->imap;
+
 			/* This unique interrupt source is now inactive. */
 			bucket->flags &= ~IBF_ACTIVE;
 
@@ -592,7 +664,18 @@ static void process_bucket(int irq, struct ino_bucket *bp, struct pt_regs *regs)
 			break;
 	}
 	if (bp->pil != 0) {
-		upa_writel(ICLR_IDLE, bp->iclr);
+		if (tlb_type == hypervisor) {
+			unsigned int ino = __irq_ino(bp);
+			int err;
+
+			err = sun4v_intr_setstate(ino, HV_INTR_STATE_IDLE);
+			if (err != HV_EOK)
+				printk("sun4v_intr_setstate(%x): "
+				       "err(%d)\n", ino, err);
+		} else {
+			upa_writel(ICLR_IDLE, bp->iclr);
+		}
+
 		/* Test and add entropy */
 		if (random & SA_SAMPLE_RANDOM)
 			add_interrupt_randomness(irq);
@@ -694,7 +777,7 @@ irqreturn_t sparc_floppy_irq(int irq, void *dev_cookie, struct pt_regs *regs)
 		val = readb(auxio_register);
 		val |= AUXIO_AUX1_FTCNT;
 		writeb(val, auxio_register);
-		val &= AUXIO_AUX1_FTCNT;
+		val &= ~AUXIO_AUX1_FTCNT;
 		writeb(val, auxio_register);
 
 		doing_pdma = 0;
@@ -727,25 +810,23 @@ EXPORT_SYMBOL(probe_irq_off);
 static int retarget_one_irq(struct irqaction *p, int goal_cpu)
 {
 	struct ino_bucket *bucket = get_ino_in_irqaction(p) + ivector_table;
-	unsigned long imap = bucket->imap;
-	unsigned int tid;
 
 	while (!cpu_online(goal_cpu)) {
 		if (++goal_cpu >= NR_CPUS)
 			goal_cpu = 0;
 	}
 
 
-	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		tid = goal_cpu << 26;
-		tid &= IMAP_AID_SAFARI;
-	} else if (this_is_starfire == 0) {
-		tid = goal_cpu << 26;
-		tid &= IMAP_TID_UPA;
+	if (tlb_type == hypervisor) {
+		unsigned int ino = __irq_ino(bucket);
+
+		sun4v_intr_settarget(ino, goal_cpu);
+		sun4v_intr_setenabled(ino, HV_INTR_ENABLED);
 	} else {
-		tid = (starfire_translate(imap, goal_cpu) << 26);
-		tid &= IMAP_TID_UPA;
+		unsigned long imap = bucket->imap;
+		unsigned int tid = sun4u_compute_tid(imap, goal_cpu);
+
+		upa_writel(tid | IMAP_VALID, imap);
 	}
-	upa_writel(tid | IMAP_VALID, imap);

 	do {
 		if (++goal_cpu >= NR_CPUS)
@@ -848,33 +929,114 @@ static void kill_prom_timer(void)

 void init_irqwork_curcpu(void)
 {
-	register struct irq_work_struct *workp asm("o2");
-	register unsigned long tmp asm("o3");
 	int cpu = hard_smp_processor_id();

-	memset(__irq_work + cpu, 0, sizeof(*workp));
-
-	/* Make sure we are called with PSTATE_IE disabled.  */
-	__asm__ __volatile__("rdpr	%%pstate, %0\n\t"
-			     : "=r" (tmp));
-	if (tmp & PSTATE_IE) {
-		prom_printf("BUG: init_irqwork_curcpu() called with "
-			    "PSTATE_IE enabled, bailing.\n");
-		__asm__ __volatile__("mov	%%i7, %0\n\t"
-				     : "=r" (tmp));
-		prom_printf("BUG: Called from %lx\n", tmp);
+	memset(__irq_work + cpu, 0, sizeof(struct irq_work_struct));
+}
+
+static void __cpuinit register_one_mondo(unsigned long paddr, unsigned long type)
+{
+	unsigned long num_entries = 128;
+	unsigned long status;
+
+	status = sun4v_cpu_qconf(type, paddr, num_entries);
+	if (status != HV_EOK) {
+		prom_printf("SUN4V: sun4v_cpu_qconf(%lu:%lx:%lu) failed, "
+			    "err %lu\n", type, paddr, num_entries, status);
 		prom_halt();
 	}
+}

-	/* Set interrupt globals.  */
-	workp = &__irq_work[cpu];
-	__asm__ __volatile__(
-	"rdpr	%%pstate, %0\n\t"
-	"wrpr	%0, %1, %%pstate\n\t"
-	"mov	%2, %%g6\n\t"
-	"wrpr	%0, 0x0, %%pstate\n\t"
-	: "=&r" (tmp)
-	: "i" (PSTATE_IG), "r" (workp));
+static void __cpuinit sun4v_register_mondo_queues(int this_cpu)
+{
+	struct trap_per_cpu *tb = &trap_block[this_cpu];
+
+	register_one_mondo(tb->cpu_mondo_pa, HV_CPU_QUEUE_CPU_MONDO);
+	register_one_mondo(tb->dev_mondo_pa, HV_CPU_QUEUE_DEVICE_MONDO);
+	register_one_mondo(tb->resum_mondo_pa, HV_CPU_QUEUE_RES_ERROR);
+	register_one_mondo(tb->nonresum_mondo_pa, HV_CPU_QUEUE_NONRES_ERROR);
+}
+
+static void __cpuinit alloc_one_mondo(unsigned long *pa_ptr, int use_bootmem)
+{
+	void *page;
+
+	if (use_bootmem)
+		page = alloc_bootmem_low_pages(PAGE_SIZE);
+	else
+		page = (void *) get_zeroed_page(GFP_ATOMIC);
+
+	if (!page) {
+		prom_printf("SUN4V: Error, cannot allocate mondo queue.\n");
+		prom_halt();
+	}
+
+	*pa_ptr = __pa(page);
+}
+
+static void __cpuinit alloc_one_kbuf(unsigned long *pa_ptr, int use_bootmem)
+{
+	void *page;
+
+	if (use_bootmem)
+		page = alloc_bootmem_low_pages(PAGE_SIZE);
+	else
+		page = (void *) get_zeroed_page(GFP_ATOMIC);
+
+	if (!page) {
+		prom_printf("SUN4V: Error, cannot allocate kbuf page.\n");
+		prom_halt();
+	}
+
+	*pa_ptr = __pa(page);
+}
+
+static void __cpuinit init_cpu_send_mondo_info(struct trap_per_cpu *tb, int use_bootmem)
+{
+#ifdef CONFIG_SMP
+	void *page;
+
+	BUILD_BUG_ON((NR_CPUS * sizeof(u16)) > (PAGE_SIZE - 64));
+
+	if (use_bootmem)
+		page = alloc_bootmem_low_pages(PAGE_SIZE);
+	else
+		page = (void *) get_zeroed_page(GFP_ATOMIC);
+
+	if (!page) {
+		prom_printf("SUN4V: Error, cannot allocate cpu mondo page.\n");
+		prom_halt();
+	}
+
+	tb->cpu_mondo_block_pa = __pa(page);
+	tb->cpu_list_pa = __pa(page + 64);
+#endif
+}
+
+/* Allocate and register the mondo and error queues for this cpu.  */
+void __cpuinit sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load)
+{
+	struct trap_per_cpu *tb = &trap_block[cpu];
+
+	if (alloc) {
+		alloc_one_mondo(&tb->cpu_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->dev_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->resum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->resum_kernel_buf_pa, use_bootmem);
+		alloc_one_mondo(&tb->nonresum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->nonresum_kernel_buf_pa, use_bootmem);
+
+		init_cpu_send_mondo_info(tb, use_bootmem);
+	}
+
+	if (load) {
+		if (cpu != hard_smp_processor_id()) {
+			prom_printf("SUN4V: init mondo on cpu %d not %d\n",
+				    cpu, hard_smp_processor_id());
+			prom_halt();
+		}
+		sun4v_register_mondo_queues(cpu);
+	}
 }

 /* Only invoked on boot processor. */
@@ -884,6 +1046,9 @@ void __init init_IRQ(void)
 	kill_prom_timer();
 	memset(&ivector_table[0], 0, sizeof(ivector_table));

+	if (tlb_type == hypervisor)
+		sun4v_init_mondo_queues(1, hard_smp_processor_id(), 1, 1);
+
 	/* We need to clear any IRQ's pending in the soft interrupt
 	 * registers, a spurious one could be left around from the
 	 * PROM timer which we just disabled.
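
The conversions between ino_bucket pointers and the integer IRQ cookies used above (__irq() at the end of the sun4v build path, __bucket() in free_irq()) are not part of this hunk; they live in include/asm-sparc64/irq.h. A minimal sketch of their assumed shape:

	/* Sketch only -- the real helpers are in include/asm-sparc64/irq.h.
	 * The driver-visible "irq" number is just the ino_bucket pointer
	 * narrowed to 32 bits, and __bucket() widens it back.
	 */
	static inline unsigned int __irq(struct ino_bucket *bucket)
	{
		return (unsigned int) (unsigned long) bucket;
	}

	static inline struct ino_bucket *__bucket(unsigned int irq)
	{
		return (struct ino_bucket *) (unsigned long) irq;
	}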

+ 0 - 79
arch/sparc64/kernel/itlb_base.S

@@ -1,79 +0,0 @@
-/* $Id: itlb_base.S,v 1.12 2002/02/09 19:49:30 davem Exp $
- * itlb_base.S:	Front end to ITLB miss replacement strategy.
- *              This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#if PAGE_SHIFT == 13
-/*
- * To compute vpte offset, we need to do ((addr >> 13) << 3),
- * which can be optimized to (addr >> 10) if bits 10/11/12 can
- * be guaranteed to be 0 ... mmu_context.h does guarantee this
- * by only using 10 bits in the hwcontext value.
- */
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, 10, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) nop
-#else /* PAGE_SHIFT */
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, PAGE_SHIFT, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				sllx	r2, 3, r2
-#endif /* PAGE_SHIFT */
-
-
-/* Ways we can get here:
- *
- * 1) Nucleus instruction misses from module code.
- * 2) All user instruction misses.
- *
- * All real page faults merge their code paths to the
- * sparc64_realfault_common label below.
- */
-
-/* ITLB ** ICACHE line 1: Quick user TLB misses		*/
-	mov		TLB_SFSR, %g1
-	ldxa		[%g1 + %g1] ASI_IMMU, %g4	! Get TAG_ACCESS
-	CREATE_VPTE_OFFSET1(%g4, %g6)			! Create VPTE offset
-	CREATE_VPTE_OFFSET2(%g4, %g6)			! Create VPTE offset
-	ldxa		[%g3 + %g6] ASI_P, %g5		! Load VPTE
-1:	brgez,pn	%g5, 3f				! Not valid, branch out
-	 sethi		%hi(_PAGE_EXEC), %g4		! Delay-slot
-	andcc		%g5, %g4, %g0			! Executable?
-
-/* ITLB ** ICACHE line 2: Real faults			*/
-	be,pn		%xcc, 3f			! Nope, branch.
-	 nop						! Delay-slot
-2:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN	! Load PTE into TLB
-	retry						! Trap return
-3:	rdpr		%pstate, %g4			! Move into alt-globals
-	wrpr		%g4, PSTATE_AG|PSTATE_MG, %pstate
-	rdpr		%tpc, %g5			! And load faulting VA
-	mov		FAULT_CODE_ITLB, %g4		! It was read from ITLB
-
-/* ITLB ** ICACHE line 3: Finish faults	*/
-sparc64_realfault_common:				! Called by dtlb_miss
-	stb		%g4, [%g6 + TI_FAULT_CODE]
-	stx		%g5, [%g6 + TI_FAULT_ADDR]
-	ba,pt		%xcc, etrap			! Save state
-1:	 rd		%pc, %g7			! ...
-	call		do_sparc64_fault		! Call fault handler
-	 add		%sp, PTREGS_OFF, %o0	! Compute pt_regs arg
-	ba,pt		%xcc, rtrap_clr_l6		! Restore cpu state
-	 nop
-
-/* ITLB ** ICACHE line 4: Window fixups */
-winfix_trampoline:
-	rdpr		%tpc, %g3			! Prepare winfixup TNPC
-	or		%g3, 0x7c, %g3			! Compute branch offset
-	wrpr		%g3, %tnpc			! Write it into TNPC
-	done						! Do it to it
-	nop
-	nop
-	nop
-	nop
-
-#undef CREATE_VPTE_OFFSET1
-#undef CREATE_VPTE_OFFSET2
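
The CREATE_VPTE_OFFSET trick in the file deleted above is easiest to check with a concrete address; with PAGE_SHIFT == 13 the two forms agree exactly because the hardware context in the low 13 bits of TAG_ACCESS only ever uses 10 of them, leaving bits 10..12 zero:

	/* Worked example of the identity the removed comment describes:
	 * addr = 0x6000 (third 8K page, bits 10..12 clear)
	 *   (0x6000 >> 13) << 3  ==  3 << 3  ==  0x18
	 *    0x6000 >> 10                    ==  0x18
	 */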

+ 39 - 0
arch/sparc64/kernel/itlb_miss.S

@@ -0,0 +1,39 @@
+/* ITLB ** ICACHE line 1: Context 0 check and TSB load	*/
+	ldxa	[%g0] ASI_IMMU_TSB_8KB_PTR, %g1	! Get TSB 8K pointer
+	ldxa	[%g0] ASI_IMMU, %g6		! Get TAG TARGET
+	srlx	%g6, 48, %g5			! Get context
+	sllx	%g6, 22, %g6			! Zero out context
+	brz,pn	%g5, kvmap_itlb			! Context 0 processing
+	 srlx	%g6, 22, %g6			! Delay slot
+	TSB_LOAD_QUAD(%g1, %g4)			! Load TSB entry
+	cmp	%g4, %g6			! Compare TAG
+
+/* ITLB ** ICACHE line 2: TSB compare and TLB load	*/
+	bne,pn	%xcc, tsb_miss_itlb		! Miss
+	 mov	FAULT_CODE_ITLB, %g3
+	andcc	%g5, _PAGE_EXEC_4U, %g0		! Executable?
+	be,pn	%xcc, tsb_do_fault
+	 nop					! Delay slot, fill me
+	stxa	%g5, [%g0] ASI_ITLB_DATA_IN	! Load TLB
+	retry					! Trap done
+	nop
+
+/* ITLB ** ICACHE line 3: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+/* ITLB ** ICACHE line 4: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
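
ICACHE line 1 above splits the TAG TARGET register into its two fields before choosing between the context-0 kernel path and the TSB tag compare. In C terms, the srlx/sllx/srlx sequence computes roughly the following (a sketch; the layout is context in bits 63:48, virtual tag in the low bits):

	unsigned long ctx = tag_target >> 48;          /* srlx %g6, 48, %g5 */
	unsigned long tag = (tag_target << 22) >> 22;  /* sllx/srlx by 22 zero the context */
	/* ctx == 0 means a kernel miss, handled at kvmap_itlb */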

+ 224 - 139
arch/sparc64/kernel/ktlb.S

@@ -4,191 +4,276 @@
  * Copyright (C) 1996 Eddie C. Dost        (ecd@brainaid.de)
  * Copyright (C) 1996 Miguel de Icaza      (miguel@nuclecu.unam.mx)
  * Copyright (C) 1996,98,99 Jakub Jelinek  (jj@sunsite.mff.cuni.cz)
-*/
+ */

 #include <linux/config.h>
 #include <asm/head.h>
 #include <asm/asi.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/tsb.h>

 	.text
 	.align		32

-/*
- * On a second level vpte miss, check whether the original fault is to the OBP 
- * range (note that this is only possible for instruction miss, data misses to
- * obp range do not use vpte). If so, go back directly to the faulting address.
- * This is because we want to read the tpc, otherwise we have no way of knowing
- * the 8k aligned faulting address if we are using >8k kernel pagesize. This
- * also ensures no vpte range addresses are dropped into tlb while obp is
- * executing (see inherit_locked_prom_mappings() rant).
- */
-sparc64_vpte_nucleus:
-	/* Note that kvmap below has verified that the address is
-	 * in the range MODULES_VADDR --> VMALLOC_END already.  So
-	 * here we need only check if it is an OBP address or not.
+kvmap_itlb:
+	/* g6: TAG TARGET */
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_IMMU, %g4
+
+	/* sun4v_itlb_miss branches here with the missing virtual
+	 * address already loaded into %g4
 	 */
+kvmap_itlb_4v:
+
+kvmap_itlb_nonlinear:
+	/* Catch kernel NULL pointer calls.  */
+	sethi		%hi(PAGE_SIZE), %g5
+	cmp		%g4, %g5
+	bleu,pn		%xcc, kvmap_dtlb_longpath
+	 nop
+
+	KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_itlb_load)
+
+kvmap_itlb_tsb_miss:
 	sethi		%hi(LOW_OBP_ADDRESS), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kern_vpte
+	blu,pn		%xcc, kvmap_itlb_vmalloc_addr
 	 mov		0x1, %g5
 	sllx		%g5, 32, %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, vpte_insn_obp
+	blu,pn		%xcc, kvmap_itlb_obp
 	 nop

-	/* These two instructions are patched by paginig_init().  */
-kern_vpte:
-	sethi		%hi(swapper_pgd_zero), %g5
-	lduw		[%g5 + %lo(swapper_pgd_zero)], %g5
+kvmap_itlb_vmalloc_addr:
+	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_itlb_longpath)

-	/* With kernel PGD in %g5, branch back into dtlb_backend.  */
-	ba,pt		%xcc, sparc64_kpte_continue
-	 andn		%g1, 0x3, %g1	/* Finish PMD offset adjustment.  */
+	KTSB_LOCK_TAG(%g1, %g2, %g7)

-vpte_noent:
-	/* Restore previous TAG_ACCESS, %g5 is zero, and we will
-	 * skip over the trap instruction so that the top level
-	 * TLB miss handler will thing this %g5 value is just an
-	 * invalid PTE, thus branching to full fault processing.
-	 */
-	mov		TLB_SFSR, %g1
-	stxa		%g4, [%g1 + %g1] ASI_DMMU
-	done
-
-vpte_insn_obp:
-	/* Behave as if we are at TL0.  */
-	wrpr		%g0, 1, %tl
-	rdpr		%tpc, %g4	/* Find original faulting iaddr */
-	srlx		%g4, 13, %g4	/* Throw out context bits */
-	sllx		%g4, 13, %g4	/* g4 has vpn + ctx0 now */
-
-	/* Restore previous TAG_ACCESS.  */
-	mov		TLB_SFSR, %g1
-	stxa		%g4, [%g1 + %g1] ASI_IMMU
-
-	sethi		%hi(prom_trans), %g5
-	or		%g5, %lo(prom_trans), %g5
-
-1:	ldx		[%g5 + 0x00], %g6	! base
-	brz,a,pn	%g6, longpath		! no more entries, fail
-	 mov		TLB_SFSR, %g1		! and restore %g1
-	ldx		[%g5 + 0x08], %g1	! len
-	add		%g6, %g1, %g1		! end
-	cmp		%g6, %g4
-	bgu,pt		%xcc, 2f
-	 cmp		%g4, %g1
-	bgeu,pt		%xcc, 2f
-	 ldx		[%g5 + 0x10], %g1	! PTE
-
-	/* TLB load, restore %g1, and return from trap.  */
-	sub		%g4, %g6, %g6
-	add		%g1, %g6, %g5
-	mov		TLB_SFSR, %g1
-	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
-	retry
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	mov		1, %g7
+	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
+	brgez,a,pn	%g5, kvmap_itlb_longpath
+	 KTSB_STORE(%g1, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	/* fallthrough to TLB load */

-2:	ba,pt		%xcc, 1b
-	 add		%g5, (3 * 8), %g5	! next entry
-
-kvmap_do_obp:
-	sethi		%hi(prom_trans), %g5
-	or		%g5, %lo(prom_trans), %g5
-	srlx		%g4, 13, %g4
-	sllx		%g4, 13, %g4
-
-1:	ldx		[%g5 + 0x00], %g6	! base
-	brz,a,pn	%g6, longpath		! no more entries, fail
-	 mov		TLB_SFSR, %g1		! and restore %g1
-	ldx		[%g5 + 0x08], %g1	! len
-	add		%g6, %g1, %g1		! end
-	cmp		%g6, %g4
-	bgu,pt		%xcc, 2f
-	 cmp		%g4, %g1
-	bgeu,pt		%xcc, 2f
-	 ldx		[%g5 + 0x10], %g1	! PTE
-
-	/* TLB load, restore %g1, and return from trap.  */
-	sub		%g4, %g6, %g6
-	add		%g1, %g6, %g5
-	mov		TLB_SFSR, %g1
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
+kvmap_itlb_load:
+
+661:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
 	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_ITLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_itlb_load
+	 mov		%g5, %g3

-2:	ba,pt		%xcc, 1b
-	 add		%g5, (3 * 8), %g5	! next entry
+kvmap_itlb_longpath:
+
+661:	rdpr	%pstate, %g5
+	wrpr	%g5, PSTATE_AG | PSTATE_MG, %pstate
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	SET_GL(1)
+	nop
+	.previous
+
+	rdpr	%tpc, %g5
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_ITLB, %g4
+
+kvmap_itlb_obp:
+	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_itlb_longpath)
+
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	ba,pt		%xcc, kvmap_itlb_load
+	 nop
+
+kvmap_dtlb_obp:
+	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_dtlb_longpath)
+
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	ba,pt		%xcc, kvmap_dtlb_load
+	 nop

-/*
- * On a first level data miss, check whether this is to the OBP range (note
- * that such accesses can be made by prom, as well as by kernel using
- * prom_getproperty on "address"), and if so, do not use vpte access ...
- * rather, use information saved during inherit_prom_mappings() using 8k
- * pagesize.
- */
 	.align		32
-kvmap:
-	brgez,pn	%g4, kvmap_nonlinear
+kvmap_dtlb_tsb4m_load:
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+	KTSB_WRITE(%g1, %g5, %g6)
+	ba,pt		%xcc, kvmap_dtlb_load
 	 nop

-#ifdef CONFIG_DEBUG_PAGEALLOC
+kvmap_dtlb:
+	/* %g6: TAG TARGET */
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_DMMU, %g4
+
+	/* sun4v_dtlb_miss branches here with the missing virtual
+	 * address already loaded into %g4
+	 */
+kvmap_dtlb_4v:
+	brgez,pn	%g4, kvmap_dtlb_nonlinear
+	 nop
+
+	/* Correct TAG_TARGET is already in %g6, check 4mb TSB.  */
+	KERN_TSB4M_LOOKUP_TL1(%g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
+
+	/* TSB entry address left in %g1, lookup linear PTE.
+	 * Must preserve %g1 and %g6 (TAG).
+	 */
+kvmap_dtlb_tsb4m_miss:
+	sethi		%hi(kpte_linear_bitmap), %g2
+	or		%g2, %lo(kpte_linear_bitmap), %g2
+
+	/* Clear the PAGE_OFFSET top virtual bits, then shift
+	 * down to get a 256MB physical address index.
+	 */
+	sllx		%g4, 21, %g5
+	mov		1, %g7
+	srlx		%g5, 21 + 28, %g5
+
+	/* Don't try this at home kids... this depends upon srlx
+	 * only taking the low 6 bits of the shift count in %g5.
+	 */
+	sllx		%g7, %g5, %g7
+
+	/* Divide by 64 to get the offset into the bitmask.  */
+	srlx		%g5, 6, %g5
+	sllx		%g5, 3, %g5
+
+	/* kern_linear_pte_xor[((mask & bit) ? 1 : 0)] */
+	ldx		[%g2 + %g5], %g2
+	andcc		%g2, %g7, %g0
+	sethi		%hi(kern_linear_pte_xor), %g5
+	or		%g5, %lo(kern_linear_pte_xor), %g5
+	bne,a,pt	%xcc, 1f
+	 add		%g5, 8, %g5
+
+1:	ldx		[%g5], %g2
+
 	.globl		kvmap_linear_patch
 kvmap_linear_patch:
-#endif
-	ba,pt		%xcc, kvmap_load
+	ba,pt		%xcc, kvmap_dtlb_tsb4m_load
 	 xor		%g2, %g4, %g5

-#ifdef CONFIG_DEBUG_PAGEALLOC
-	sethi		%hi(swapper_pg_dir), %g5
-	or		%g5, %lo(swapper_pg_dir), %g5
-	sllx		%g4, 64 - (PGDIR_SHIFT + PGDIR_BITS), %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	andn		%g6, 0x3, %g6
-	lduw		[%g5 + %g6], %g5
-	brz,pn		%g5, longpath
-	 sllx		%g4, 64 - (PMD_SHIFT + PMD_BITS), %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	sllx		%g5, 11, %g5
-	andn		%g6, 0x3, %g6
-	lduwa		[%g5 + %g6] ASI_PHYS_USE_EC, %g5
-	brz,pn		%g5, longpath
-	 sllx		%g4, 64 - PMD_SHIFT, %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	sllx		%g5, 11, %g5
-	andn		%g6, 0x7, %g6
-	ldxa		[%g5 + %g6] ASI_PHYS_USE_EC, %g5
-	brz,pn		%g5, longpath
+kvmap_dtlb_vmalloc_addr:
+	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_dtlb_longpath)
+
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	mov		1, %g7
+	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
+	brgez,a,pn	%g5, kvmap_dtlb_longpath
+	 KTSB_STORE(%g1, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	/* fallthrough to TLB load */
+
+kvmap_dtlb_load:
+
+661:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
+	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_DTLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_dtlb_load
+	 mov		%g5, %g3
+
+kvmap_dtlb_nonlinear:
+	/* Catch kernel NULL pointer derefs.  */
+	sethi		%hi(PAGE_SIZE), %g5
+	cmp		%g4, %g5
+	bleu,pn		%xcc, kvmap_dtlb_longpath
 	 nop
-	ba,a,pt		%xcc, kvmap_load
-#endif

-kvmap_nonlinear:
+	KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
+
+kvmap_dtlb_tsbmiss:
 	sethi		%hi(MODULES_VADDR), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, longpath
+	blu,pn		%xcc, kvmap_dtlb_longpath
 	 mov		(VMALLOC_END >> 24), %g5
 	sllx		%g5, 24, %g5
 	cmp		%g4, %g5
-	bgeu,pn		%xcc, longpath
+	bgeu,pn		%xcc, kvmap_dtlb_longpath
 	 nop

 kvmap_check_obp:
 	sethi		%hi(LOW_OBP_ADDRESS), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kvmap_vmalloc_addr
+	blu,pn		%xcc, kvmap_dtlb_vmalloc_addr
 	 mov		0x1, %g5
 	sllx		%g5, 32, %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kvmap_do_obp
+	blu,pn		%xcc, kvmap_dtlb_obp
 	 nop
-
-kvmap_vmalloc_addr:
-	/* If we get here, a vmalloc addr was accessed, load kernel VPTE.  */
-	ldxa		[%g3 + %g6] ASI_N, %g5
-	brgez,pn	%g5, longpath
+	ba,pt		%xcc, kvmap_dtlb_vmalloc_addr
 	 nop

-kvmap_load:
-	/* PTE is valid, load into TLB and return from trap.  */
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
-	retry
+kvmap_dtlb_longpath:
+
+661:	rdpr	%pstate, %g5
+	wrpr	%g5, PSTATE_AG | PSTATE_MG, %pstate
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	SET_GL(1)
+	ldxa		[%g0] ASI_SCRATCHPAD, %g5
+	.previous
+
+	rdpr	%tl, %g3
+	cmp	%g3, 1
+
+661:	mov	TLB_TAG_ACCESS, %g4
+	ldxa	[%g4] ASI_DMMU, %g5
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	ldx	[%g5 + HV_FAULT_D_ADDR_OFFSET], %g5
+	nop
+	.previous
+
+	be,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB, %g4
+	ba,pt	%xcc, winfix_trampoline
+	 nop
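
The shift sequence in kvmap_dtlb_tsb4m_miss selects one bit of kpte_linear_bitmap, where each bit covers a 256MB chunk of the kernel's linear mapping. The same computation written out in C, as a sketch using the symbols from the assembly:

	unsigned long idx  = (vaddr << 21) >> (21 + 28);   /* strip PAGE_OFFSET bits, /256MB */
	unsigned long bit  = 1UL << (idx & 63);            /* the shift only sees 6 count bits */
	unsigned long mask = kpte_linear_bitmap[idx >> 6]; /* srlx 6 / sllx 3 -> byte offset */
	pte_xor = kern_linear_pte_xor[(mask & bit) ? 1 : 0];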

+ 13 - 0
arch/sparc64/kernel/pci.c

@@ -188,6 +188,7 @@ extern void psycho_init(int, char *);
 extern void schizo_init(int, char *);
 extern void schizo_plus_init(int, char *);
 extern void tomatillo_init(int, char *);
+extern void sun4v_pci_init(int, char *);

 static struct {
 	char *model_name;
@@ -204,6 +205,7 @@ static struct {
 	{ "pci108e,8002", schizo_plus_init },
 	{ "SUNW,tomatillo", tomatillo_init },
 	{ "pci108e,a801", tomatillo_init },
+	{ "SUNW,sun4v-pci", sun4v_pci_init },
 };
 #define PCI_NUM_CONTROLLER_TYPES (sizeof(pci_controller_table) / \
 				  sizeof(pci_controller_table[0]))
@@ -283,6 +285,12 @@ int __init pcic_present(void)
 	return pci_controller_scan(pci_is_controller);
 }

+struct pci_iommu_ops *pci_iommu_ops;
+EXPORT_SYMBOL(pci_iommu_ops);
+
+extern struct pci_iommu_ops pci_sun4u_iommu_ops,
+	pci_sun4v_iommu_ops;
+
 /* Find each controller in the system, attach and initialize
  * software state structure for each and link into the
  * pci_controller_root.  Setup the controller enough such
@@ -290,6 +298,11 @@ int __init pcic_present(void)
  */
 static void __init pci_controller_probe(void)
 {
+	if (tlb_type == hypervisor)
+		pci_iommu_ops = &pci_sun4v_iommu_ops;
+	else
+		pci_iommu_ops = &pci_sun4u_iommu_ops;
+
 	printk("PCI: Probing for controllers.\n");

 	pci_controller_scan(pci_controller_init);
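
With pci_iommu_ops selected once at probe time, the previously exported DMA entry points can become static (see the pci_iommu.c hunk below) and callers indirect through the ops table instead. A sketch of the assumed wrapper shape on the header side (the real wrappers live in include/asm-sparc64/pci.h, not shown in this diff):

	static inline dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr,
						size_t sz, int direction)
	{
		return pci_iommu_ops->map_single(pdev, ptr, sz, direction);
	}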

+ 174 - 127
arch/sparc64/kernel/pci_common.c

@@ -39,6 +39,8 @@ static int __init find_device_prom_node(struct pci_pbm_info *pbm,
 {
 	int node;

+	*nregs = 0;
+
 	/*
 	 * Return the PBM's PROM node in case we are it's PCI device,
 	 * as the PBM's reg property is different to standard PCI reg
@@ -51,10 +53,8 @@ static int __init find_device_prom_node(struct pci_pbm_info *pbm,
 	     pdev->device == PCI_DEVICE_ID_SUN_SCHIZO ||
 	     pdev->device == PCI_DEVICE_ID_SUN_TOMATILLO ||
 	     pdev->device == PCI_DEVICE_ID_SUN_SABRE ||
-	     pdev->device == PCI_DEVICE_ID_SUN_HUMMINGBIRD)) {
-		*nregs = 0;
+	     pdev->device == PCI_DEVICE_ID_SUN_HUMMINGBIRD))
 		return bus_prom_node;
-	}

 	node = prom_getchild(bus_prom_node);
 	while (node != 0) {
@@ -541,135 +541,183 @@ void __init pci_assign_unassigned(struct pci_pbm_info *pbm,
 		pci_assign_unassigned(pbm, bus);
 }

-static int __init pci_intmap_match(struct pci_dev *pdev, unsigned int *interrupt)
+static inline unsigned int pci_slot_swivel(struct pci_pbm_info *pbm,
+					   struct pci_dev *toplevel_pdev,
+					   struct pci_dev *pdev,
+					   unsigned int interrupt)
 {
-	struct linux_prom_pci_intmap bridge_local_intmap[PROM_PCIIMAP_MAX], *intmap;
-	struct linux_prom_pci_intmask bridge_local_intmask, *intmask;
-	struct pcidev_cookie *dev_pcp = pdev->sysdata;
-	struct pci_pbm_info *pbm = dev_pcp->pbm;
-	struct linux_prom_pci_registers *pregs = dev_pcp->prom_regs;
-	unsigned int hi, mid, lo, irq;
-	int i, num_intmap, map_slot;
+	unsigned int ret;

-	intmap = &pbm->pbm_intmap[0];
-	intmask = &pbm->pbm_intmask;
-	num_intmap = pbm->num_pbm_intmap;
-	map_slot = 0;
+	if (unlikely(interrupt < 1 || interrupt > 4)) {
+		printk("%s: Device %s interrupt value of %u is strange.\n",
+		       pbm->name, pci_name(pdev), interrupt);
+		return interrupt;
+	}

-	/* If we are underneath a PCI bridge, use PROM register
-	 * property of the parent bridge which is closest to
-	 * the PBM.
-	 *
-	 * However if that parent bridge has interrupt map/mask
-	 * properties of its own we use the PROM register property
-	 * of the next child device on the path to PDEV.
-	 *
-	 * In detail the two cases are (note that the 'X' below is the
-	 * 'next child on the path to PDEV' mentioned above):
-	 *
-	 * 1) PBM --> PCI bus lacking int{map,mask} --> X ... PDEV
-	 *
-	 *    Here we use regs of 'PCI bus' device.
-	 *
-	 * 2) PBM --> PCI bus with int{map,mask} --> X ... PDEV
-	 *
-	 *    Here we use regs of 'X'.  Note that X can be PDEV.
-	 */
-	if (pdev->bus->number != pbm->pci_first_busno) {
-		struct pcidev_cookie *bus_pcp, *regs_pcp;
-		struct pci_dev *bus_dev, *regs_dev;
-		int plen;
+	ret = ((interrupt - 1 + (PCI_SLOT(pdev->devfn) & 3)) & 3) + 1;
+
+	printk("%s: %s IRQ Swivel %s [%x:%x] -> [%x]\n",
+	       pbm->name, pci_name(toplevel_pdev), pci_name(pdev),
+	       interrupt, PCI_SLOT(pdev->devfn), ret);
+
+	return ret;
+}
+
+static inline unsigned int pci_apply_intmap(struct pci_pbm_info *pbm,
+					    struct pci_dev *toplevel_pdev,
+					    struct pci_dev *pbus,
+					    struct pci_dev *pdev,
+					    unsigned int interrupt,
+					    unsigned int *cnode)
+{
+	struct linux_prom_pci_intmap imap[PROM_PCIIMAP_MAX];
+	struct linux_prom_pci_intmask imask;
+	struct pcidev_cookie *pbus_pcp = pbus->sysdata;
+	struct pcidev_cookie *pdev_pcp = pdev->sysdata;
+	struct linux_prom_pci_registers *pregs = pdev_pcp->prom_regs;
+	int plen, num_imap, i;
+	unsigned int hi, mid, lo, irq, orig_interrupt;
+
+	*cnode = pbus_pcp->prom_node;
+
+	plen = prom_getproperty(pbus_pcp->prom_node, "interrupt-map",
+				(char *) &imap[0], sizeof(imap));
+	if (plen <= 0 ||
+	    (plen % sizeof(struct linux_prom_pci_intmap)) != 0) {
+		printk("%s: Device %s interrupt-map has bad len %d\n",
+		       pbm->name, pci_name(pbus), plen);
+		goto no_intmap;
+	}
+	num_imap = plen / sizeof(struct linux_prom_pci_intmap);
+
+	plen = prom_getproperty(pbus_pcp->prom_node, "interrupt-map-mask",
+				(char *) &imask, sizeof(imask));
+	if (plen <= 0 ||
+	    (plen % sizeof(struct linux_prom_pci_intmask)) != 0) {
+		printk("%s: Device %s interrupt-map-mask has bad len %d\n",
+		       pbm->name, pci_name(pbus), plen);
+		goto no_intmap;
+	}
+
+	orig_interrupt = interrupt;

-		bus_dev = pdev->bus->self;
-		regs_dev = pdev;
+	hi   = pregs->phys_hi & imask.phys_hi;
+	mid  = pregs->phys_mid & imask.phys_mid;
+	lo   = pregs->phys_lo & imask.phys_lo;
+	irq  = interrupt & imask.interrupt;

-		while (bus_dev->bus &&
-		       bus_dev->bus->number != pbm->pci_first_busno) {
-			regs_dev = bus_dev;
-			bus_dev = bus_dev->bus->self;
+	for (i = 0; i < num_imap; i++) {
+		if (imap[i].phys_hi  == hi   &&
+		    imap[i].phys_mid == mid  &&
+		    imap[i].phys_lo  == lo   &&
+		    imap[i].interrupt == irq) {
+			*cnode = imap[i].cnode;
+			interrupt = imap[i].cinterrupt;
 		}
+	}

-		regs_pcp = regs_dev->sysdata;
-		pregs = regs_pcp->prom_regs;
+	printk("%s: %s MAP BUS %s DEV %s [%x] -> [%x]\n",
+	       pbm->name, pci_name(toplevel_pdev),
+	       pci_name(pbus), pci_name(pdev),
+	       orig_interrupt, interrupt);
 
 
-		bus_pcp = bus_dev->sysdata;
+no_intmap:
+	return interrupt;
+}

-		/* But if the PCI bridge has it's own interrupt map
-		 * and mask properties, use that and the regs of the
-		 * PCI entity at the next level down on the path to the
-		 * device.
-		 */
-		plen = prom_getproperty(bus_pcp->prom_node, "interrupt-map",
-					(char *) &bridge_local_intmap[0],
-					sizeof(bridge_local_intmap));
-		if (plen != -1) {
-			intmap = &bridge_local_intmap[0];
-			num_intmap = plen / sizeof(struct linux_prom_pci_intmap);
-			plen = prom_getproperty(bus_pcp->prom_node,
-						"interrupt-map-mask",
-						(char *) &bridge_local_intmask,
-						sizeof(bridge_local_intmask));
-			if (plen == -1) {
-				printk("pci_intmap_match: Warning! Bridge has intmap "
-				       "but no intmask.\n");
-				printk("pci_intmap_match: Trying to recover.\n");
-				return 0;
-			}
+/* For each PCI bus on the way to the root:
+ * 1) If it has an interrupt-map property, apply it.
+ * 2) Else, swivel the interrupt number based upon the PCI device number.
+ *
+ * Return the "IRQ controller" node.  If this is the PBM's device node,
+ * all interrupt translations are complete, else we should use that node's
+ * "reg" property to apply the PBM's "interrupt-{map,mask}" to the interrupt.
+ */
+static unsigned int __init pci_intmap_match_to_root(struct pci_pbm_info *pbm,
+						    struct pci_dev *pdev,
+						    unsigned int *interrupt)
+{
+	struct pci_dev *toplevel_pdev = pdev;
+	struct pcidev_cookie *toplevel_pcp = toplevel_pdev->sysdata;
+	unsigned int cnode = toplevel_pcp->prom_node;
+
+	while (pdev->bus->number != pbm->pci_first_busno) {
+		struct pci_dev *pbus = pdev->bus->self;
+		struct pcidev_cookie *pcp = pbus->sysdata;
+		int plen;

-			if (pdev->bus->self != bus_dev)
-				map_slot = 1;
+		plen = prom_getproplen(pcp->prom_node, "interrupt-map");
+		if (plen <= 0) {
+			*interrupt = pci_slot_swivel(pbm, toplevel_pdev,
+						     pdev, *interrupt);
+			cnode = pcp->prom_node;
 		} else {
-			pregs = bus_pcp->prom_regs;
-			map_slot = 1;
+			*interrupt = pci_apply_intmap(pbm, toplevel_pdev,
+						      pbus, pdev,
+						      *interrupt, &cnode);
+
+			while (pcp->prom_node != cnode &&
+			       pbus->bus->number != pbm->pci_first_busno) {
+				pbus = pbus->bus->self;
+				pcp = pbus->sysdata;
+			}
 		}
-	}
+		pdev = pbus;

-	if (map_slot) {
-		*interrupt = ((*interrupt
-			       - 1
-			       + PCI_SLOT(pdev->devfn)) & 0x3) + 1;
+		if (cnode == pbm->prom_node)
+			break;
 	}

-	hi   = pregs->phys_hi & intmask->phys_hi;
-	mid  = pregs->phys_mid & intmask->phys_mid;
-	lo   = pregs->phys_lo & intmask->phys_lo;
-	irq  = *interrupt & intmask->interrupt;
-
-	for (i = 0; i < num_intmap; i++) {
-		if (intmap[i].phys_hi  == hi	&&
-		    intmap[i].phys_mid == mid	&&
-		    intmap[i].phys_lo  == lo	&&
-		    intmap[i].interrupt == irq) {
-			*interrupt = intmap[i].cinterrupt;
-			printk("PCI-IRQ: Routing bus[%2x] slot[%2x] map[%d] to INO[%02x]\n",
-			       pdev->bus->number, PCI_SLOT(pdev->devfn),
-			       map_slot, *interrupt);
-			return 1;
-		}
+	return cnode;
+}
+
+static int __init pci_intmap_match(struct pci_dev *pdev, unsigned int *interrupt)
+{
+	struct pcidev_cookie *dev_pcp = pdev->sysdata;
+	struct pci_pbm_info *pbm = dev_pcp->pbm;
+	struct linux_prom_pci_registers reg[PROMREG_MAX];
+	unsigned int hi, mid, lo, irq;
+	int i, cnode, plen;
+
+	cnode = pci_intmap_match_to_root(pbm, pdev, interrupt);
+	if (cnode == pbm->prom_node)
+		goto success;
+
+	plen = prom_getproperty(cnode, "reg", (char *) reg, sizeof(reg));
+	if (plen <= 0 ||
+	    (plen % sizeof(struct linux_prom_pci_registers)) != 0) {
+		printk("%s: OBP node %x reg property has bad len %d\n",
+		       pbm->name, cnode, plen);
+		goto fail;
 	}

-	/* We will run this code even if pbm->num_pbm_intmap is zero, just so
-	 * we can apply the slot mapping to the PROM interrupt property value.
-	 * So do not spit out these warnings in that case.
-	 */
-	if (num_intmap != 0) {
-		/* Print it both to OBP console and kernel one so that if bootup
-		 * hangs here the user has the information to report.
-		 */
-		prom_printf("pci_intmap_match: bus %02x, devfn %02x: ",
-			    pdev->bus->number, pdev->devfn);
-		prom_printf("IRQ [%08x.%08x.%08x.%08x] not found in interrupt-map\n",
-			    pregs->phys_hi, pregs->phys_mid, pregs->phys_lo, *interrupt);
-		prom_printf("Please email this information to davem@redhat.com\n");
-
-		printk("pci_intmap_match: bus %02x, devfn %02x: ",
-		       pdev->bus->number, pdev->devfn);
-		printk("IRQ [%08x.%08x.%08x.%08x] not found in interrupt-map\n",
-		       pregs->phys_hi, pregs->phys_mid, pregs->phys_lo, *interrupt);
-		printk("Please email this information to davem@redhat.com\n");
+	hi   = reg[0].phys_hi & pbm->pbm_intmask.phys_hi;
+	mid  = reg[0].phys_mid & pbm->pbm_intmask.phys_mid;
+	lo   = reg[0].phys_lo & pbm->pbm_intmask.phys_lo;
+	irq  = *interrupt & pbm->pbm_intmask.interrupt;
+
+	for (i = 0; i < pbm->num_pbm_intmap; i++) {
+		struct linux_prom_pci_intmap *intmap;
+
+		intmap = &pbm->pbm_intmap[i];
+
+		if (intmap->phys_hi  == hi  &&
+		    intmap->phys_mid == mid &&
+		    intmap->phys_lo  == lo  &&
+		    intmap->interrupt == irq) {
+			*interrupt = intmap->cinterrupt;
+			goto success;
+		}
 	}

+fail:
 	return 0;
+
+success:
+	printk("PCI-IRQ: Routing bus[%2x] slot[%2x] to INO[%02x]\n",
+	       pdev->bus->number, PCI_SLOT(pdev->devfn),
+	       *interrupt);
+	return 1;
 }

 static void __init pdev_fixup_irq(struct pci_dev *pdev)
@@ -703,16 +751,18 @@ static void __init pdev_fixup_irq(struct pci_dev *pdev)
 		return;
 	}

-	/* Fully specified already? */
-	if (((prom_irq & PCI_IRQ_IGN) >> 6) == portid) {
-		pdev->irq = p->irq_build(pbm, pdev, prom_irq);
-		goto have_irq;
-	}
+	if (tlb_type != hypervisor) {
+		/* Fully specified already? */
+		if (((prom_irq & PCI_IRQ_IGN) >> 6) == portid) {
+			pdev->irq = p->irq_build(pbm, pdev, prom_irq);
+			goto have_irq;
+		}

-	/* An onboard device? (bit 5 set) */
-	if ((prom_irq & PCI_IRQ_INO) & 0x20) {
-		pdev->irq = p->irq_build(pbm, pdev, (portid << 6 | prom_irq));
-		goto have_irq;
+		/* An onboard device? (bit 5 set) */
+		if ((prom_irq & PCI_IRQ_INO) & 0x20) {
+			pdev->irq = p->irq_build(pbm, pdev, (portid << 6 | prom_irq));
+			goto have_irq;
+		}
 	}

 	/* Can we find a matching entry in the interrupt-map? */
@@ -927,33 +977,30 @@ void pci_register_legacy_regions(struct resource *io_res,
 	struct resource *p;

 	/* VGA Video RAM. */
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;

-	memset(p, 0, sizeof(*p));
 	p->name = "Video RAM area";
 	p->start = mem_res->start + 0xa0000UL;
 	p->end = p->start + 0x1ffffUL;
 	p->flags = IORESOURCE_BUSY;
 	request_resource(mem_res, p);

-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;

-	memset(p, 0, sizeof(*p));
 	p->name = "System ROM";
 	p->start = mem_res->start + 0xf0000UL;
 	p->end = p->start + 0xffffUL;
 	p->flags = IORESOURCE_BUSY;
 	request_resource(mem_res, p);

-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;

-	memset(p, 0, sizeof(*p));
 	p->name = "Video ROM";
 	p->start = mem_res->start + 0xc0000UL;
 	p->end = p->start + 0x7fffUL;
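
The rotation in pci_slot_swivel() is the standard PCI-to-PCI bridge interrupt swizzle, shifting INTA..INTD by the device's slot position. One concrete case, plugging values into the expression from the hunk above:

	/* interrupt = 1 (INTA) on a device in slot 2:
	 *   ((1 - 1 + (2 & 3)) & 3) + 1  ==  3, i.e. INTC at the parent bridge
	 */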

+ 23 - 13
arch/sparc64/kernel/pci_iommu.c

@@ -139,12 +139,11 @@ void pci_iommu_table_init(struct pci_iommu *iommu, int tsbsize, u32 dma_offset,
 	/* Allocate and initialize the free area map.  */
 	sz = num_tsb_entries / 8;
 	sz = (sz + 7UL) & ~7UL;
-	iommu->arena.map = kmalloc(sz, GFP_KERNEL);
+	iommu->arena.map = kzalloc(sz, GFP_KERNEL);
 	if (!iommu->arena.map) {
 		prom_printf("PCI_IOMMU: Error, kmalloc(arena.map) failed.\n");
 		prom_halt();
 	}
-	memset(iommu->arena.map, 0, sz);
 	iommu->arena.limit = num_tsb_entries;

 	/* Allocate and initialize the dummy page which we
@@ -219,7 +218,7 @@ static inline void iommu_free_ctx(struct pci_iommu *iommu, int ctx)
  * DMA for PCI device PDEV.  Return non-NULL cpu-side address if
  * successful and set *DMA_ADDRP to the PCI side dma address.
  */
-void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp)
+static void *pci_4u_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -267,7 +266,7 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_ad
 }

 /* Free and unmap a consistent DMA translation. */
-void pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_t dvma)
+static void pci_4u_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_t dvma)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -294,7 +293,7 @@ void pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_
 /* Map a single buffer at PTR of SZ bytes for PCI DMA
  * in streaming mode.
  */
-dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
+static dma_addr_t pci_4u_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -415,7 +414,7 @@ do_flush_sync:
 }

 /* Unmap a single streaming mode DMA translation. */
-void pci_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+static void pci_4u_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -548,7 +547,7 @@ static inline void fill_sg(iopte_t *iopte, struct scatterlist *sg,
  * When making changes here, inspect the assembly output. I was having
  * hard time to kepp this routine out of using stack slots for holding variables.
  */
-int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+static int pci_4u_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -562,9 +561,9 @@ int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int
 	/* Fast path single entry scatterlists. */
 	if (nelems == 1) {
 		sglist->dma_address =
-			pci_map_single(pdev,
-				       (page_address(sglist->page) + sglist->offset),
-				       sglist->length, direction);
+			pci_4u_map_single(pdev,
+					  (page_address(sglist->page) + sglist->offset),
+					  sglist->length, direction);
 		if (unlikely(sglist->dma_address == PCI_DMA_ERROR_CODE))
 			return 0;
 		sglist->dma_length = sglist->length;
@@ -635,7 +634,7 @@ bad_no_ctx:
 }

 /* Unmap a set of streaming mode DMA translations. */
-void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+static void pci_4u_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -695,7 +694,7 @@ void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems,
 /* Make physical memory consistent for a single
  * streaming mode DMA translation after a transfer.
  */
-void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+static void pci_4u_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -735,7 +734,7 @@ void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size
 /* Make physical memory consistent for a set of streaming
  * mode DMA translations after a transfer.
  */
-void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+static void pci_4u_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -776,6 +775,17 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, i
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }

+struct pci_iommu_ops pci_sun4u_iommu_ops = {
+	.alloc_consistent		= pci_4u_alloc_consistent,
+	.free_consistent		= pci_4u_free_consistent,
+	.map_single			= pci_4u_map_single,
+	.unmap_single			= pci_4u_unmap_single,
+	.map_sg				= pci_4u_map_sg,
+	.unmap_sg			= pci_4u_unmap_sg,
+	.dma_sync_single_for_cpu	= pci_4u_dma_sync_single_for_cpu,
+	.dma_sync_sg_for_cpu		= pci_4u_dma_sync_sg_for_cpu,
+};
+
 static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit)
 {
 	struct pci_dev *ali_isa_bridge;

+ 10 - 13
arch/sparc64/kernel/pci_psycho.c

@@ -286,17 +286,17 @@ static unsigned char psycho_pil_table[] = {
 /*0x14*/0, 0, 0, 0,	/* PCI B slot 1  Int A, B, C, D */
 /*0x18*/0, 0, 0, 0,	/* PCI B slot 2  Int A, B, C, D */
 /*0x1c*/0, 0, 0, 0,	/* PCI B slot 3  Int A, B, C, D */
-/*0x20*/4,		/* SCSI				*/
+/*0x20*/5,		/* SCSI				*/
 /*0x21*/5,		/* Ethernet			*/
 /*0x22*/8,		/* Parallel Port		*/
 /*0x23*/13,		/* Audio Record			*/
 /*0x24*/14,		/* Audio Playback		*/
 /*0x25*/15,		/* PowerFail			*/
-/*0x26*/4,		/* second SCSI			*/
+/*0x26*/5,		/* second SCSI			*/
 /*0x27*/11,		/* Floppy			*/
-/*0x28*/4,		/* Spare Hardware		*/
+/*0x28*/5,		/* Spare Hardware		*/
 /*0x29*/9,		/* Keyboard			*/
-/*0x2a*/4,		/* Mouse			*/
+/*0x2a*/5,		/* Mouse			*/
 /*0x2b*/12,		/* Serial			*/
 /*0x2c*/10,		/* Timer 0			*/
 /*0x2d*/11,		/* Timer 1			*/
@@ -313,11 +313,11 @@ static int psycho_ino_to_pil(struct pci_dev *pdev, unsigned int ino)

 	ret = psycho_pil_table[ino];
 	if (ret == 0 && pdev == NULL) {
-		ret = 4;
+		ret = 5;
 	} else if (ret == 0) {
 		switch ((pdev->class >> 16) & 0xff) {
 		case PCI_BASE_CLASS_STORAGE:
-			ret = 4;
+			ret = 5;
 			break;

 		case PCI_BASE_CLASS_NETWORK:
@@ -336,7 +336,7 @@ static int psycho_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 			break;

 		default:
-			ret = 4;
+			ret = 5;
 			break;
 		};
 	}
@@ -1164,7 +1164,7 @@ static void pbm_config_busmastering(struct pci_pbm_info *pbm)
 static void pbm_scan_bus(struct pci_controller_info *p,
 			 struct pci_pbm_info *pbm)
 {
-	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+	struct pcidev_cookie *cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);

 	if (!cookie) {
 		prom_printf("PSYCHO: Critical allocation failure.\n");
@@ -1172,7 +1172,6 @@ static void pbm_scan_bus(struct pci_controller_info *p,
 	}

 	/* All we care about is the PBM. */
-	memset(cookie, 0, sizeof(*cookie));
 	cookie->pbm = pbm;

 	pbm->pci_bus = pci_scan_bus(pbm->pci_first_busno,
@@ -1465,18 +1464,16 @@ void psycho_init(int node, char *model_name)
 		}
 	}

-	p = kmalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
+	p = kzalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
 	if (!p) {
 		prom_printf("PSYCHO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(p, 0, sizeof(*p));
-	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("PSYCHO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_A.iommu = p->pbm_B.iommu = iommu;

 	p->next = pci_controller_root;

+ 10 - 13
arch/sparc64/kernel/pci_sabre.c

@@ -533,17 +533,17 @@ static unsigned char sabre_pil_table[] = {
 /*0x14*/0, 0, 0, 0,	/* PCI B slot 1  Int A, B, C, D */
 /*0x18*/0, 0, 0, 0,	/* PCI B slot 2  Int A, B, C, D */
 /*0x1c*/0, 0, 0, 0,	/* PCI B slot 3  Int A, B, C, D */
-/*0x20*/4,		/* SCSI				*/
+/*0x20*/5,		/* SCSI				*/
 /*0x21*/5,		/* Ethernet			*/
 /*0x22*/8,		/* Parallel Port		*/
 /*0x23*/13,		/* Audio Record			*/
 /*0x24*/14,		/* Audio Playback		*/
 /*0x25*/15,		/* PowerFail			*/
-/*0x26*/4,		/* second SCSI			*/
+/*0x26*/5,		/* second SCSI			*/
 /*0x27*/11,		/* Floppy			*/
-/*0x28*/4,		/* Spare Hardware		*/
+/*0x28*/5,		/* Spare Hardware		*/
 /*0x29*/9,		/* Keyboard			*/
-/*0x2a*/4,		/* Mouse			*/
+/*0x2a*/5,		/* Mouse			*/
 /*0x2b*/12,		/* Serial			*/
 /*0x2c*/10,		/* Timer 0			*/
 /*0x2d*/11,		/* Timer 1			*/
@@ -565,11 +565,11 @@ static int sabre_ino_to_pil(struct pci_dev *pdev, unsigned int ino)

 	ret = sabre_pil_table[ino];
 	if (ret == 0 && pdev == NULL) {
-		ret = 4;
+		ret = 5;
 	} else if (ret == 0) {
 		switch ((pdev->class >> 16) & 0xff) {
 		case PCI_BASE_CLASS_STORAGE:
-			ret = 4;
+			ret = 5;
 			break;

 		case PCI_BASE_CLASS_NETWORK:
@@ -588,7 +588,7 @@ static int sabre_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 			break;

 		default:
-			ret = 4;
+			ret = 5;
 			break;
 		};
 	}
@@ -1167,7 +1167,7 @@ static void apb_init(struct pci_controller_info *p, struct pci_bus *sabre_bus)

 static struct pcidev_cookie *alloc_bridge_cookie(struct pci_pbm_info *pbm)
 {
-	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+	struct pcidev_cookie *cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);

 	if (!cookie) {
 		prom_printf("SABRE: Critical allocation failure.\n");
@@ -1175,7 +1175,6 @@ static struct pcidev_cookie *alloc_bridge_cookie(struct pci_pbm_info *pbm)
 	}

 	/* All we care about is the PBM. */
-	memset(cookie, 0, sizeof(*cookie));
 	cookie->pbm = pbm;

 	return cookie;
@@ -1556,19 +1555,17 @@ void sabre_init(int pnode, char *model_name)
 		}
 	}

-	p = kmalloc(sizeof(*p), GFP_ATOMIC);
+	p = kzalloc(sizeof(*p), GFP_ATOMIC);
 	if (!p) {
 		prom_printf("SABRE: Error, kmalloc(pci_controller_info) failed.\n");
 		prom_halt();
 	}
-	memset(p, 0, sizeof(*p));

-	iommu = kmalloc(sizeof(*iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(*iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("SABRE: Error, kmalloc(pci_iommu) failed.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_A.iommu = p->pbm_B.iommu = iommu;

 	upa_portid = prom_getintdefault(pnode, "upa-portid", 0xff);

+ 10 - 14
arch/sparc64/kernel/pci_schizo.c

@@ -243,8 +243,8 @@ static unsigned char schizo_pil_table[] = {
 /*0x0c*/0, 0, 0, 0,	/* PCI slot 3  Int A, B, C, D	*/
 /*0x10*/0, 0, 0, 0,	/* PCI slot 4  Int A, B, C, D	*/
 /*0x14*/0, 0, 0, 0,	/* PCI slot 5  Int A, B, C, D	*/
-/*0x18*/4,		/* SCSI				*/
-/*0x19*/4,		/* second SCSI			*/
+/*0x18*/5,		/* SCSI				*/
+/*0x19*/5,		/* second SCSI			*/
 /*0x1a*/0,		/* UNKNOWN			*/
 /*0x1b*/0,		/* UNKNOWN			*/
 /*0x1c*/8,		/* Parallel			*/
@@ -254,7 +254,7 @@ static unsigned char schizo_pil_table[] = {
 /*0x20*/13,		/* Audio Record			*/
 /*0x21*/14,		/* Audio Playback		*/
 /*0x22*/12,		/* Serial			*/
-/*0x23*/4,		/* EBUS I2C 			*/
+/*0x23*/5,		/* EBUS I2C 			*/
 /*0x24*/10,		/* RTC Clock			*/
 /*0x25*/11,		/* Floppy			*/
 /*0x26*/0,		/* UNKNOWN			*/
@@ -296,11 +296,11 @@ static int schizo_ino_to_pil(struct pci_dev *pdev, unsigned int ino)

 	ret = schizo_pil_table[ino];
 	if (ret == 0 && pdev == NULL) {
-		ret = 4;
+		ret = 5;
 	} else if (ret == 0) {
 		switch ((pdev->class >> 16) & 0xff) {
 		case PCI_BASE_CLASS_STORAGE:
-			ret = 4;
+			ret = 5;
 			break;

 		case PCI_BASE_CLASS_NETWORK:
@@ -319,7 +319,7 @@ static int schizo_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 			break;

 		default:
-			ret = 4;
+			ret = 5;
 			break;
 		};
 	}
@@ -1525,7 +1525,7 @@ static void pbm_config_busmastering(struct pci_pbm_info *pbm)
 static void pbm_scan_bus(struct pci_controller_info *p,
 			 struct pci_pbm_info *pbm)
 {
-	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+	struct pcidev_cookie *cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);

 	if (!cookie) {
 		prom_printf("%s: Critical allocation failure.\n", pbm->name);
@@ -1533,7 +1533,6 @@ static void pbm_scan_bus(struct pci_controller_info *p,
 	}

 	/* All we care about is the PBM. */
-	memset(cookie, 0, sizeof(*cookie));
 	cookie->pbm = pbm;

 	pbm->pci_bus = pci_scan_bus(pbm->pci_first_busno,
@@ -2120,27 +2119,24 @@ static void __schizo_init(int node, char *model_name, int chip_type)
 		}
 	}

-	p = kmalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
+	p = kzalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
 	if (!p) {
 		prom_printf("SCHIZO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(p, 0, sizeof(*p));

-	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("SCHIZO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_A.iommu = iommu;

-	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("SCHIZO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_B.iommu = iommu;

 	p->next = pci_controller_root;

+ 1147 - 0
arch/sparc64/kernel/pci_sun4v.c

@@ -0,0 +1,1147 @@
+/* pci_sun4v.c: SUN4V specific PCI controller support.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+
+#include <asm/pbm.h>
+#include <asm/iommu.h>
+#include <asm/irq.h>
+#include <asm/upa.h>
+#include <asm/pstate.h>
+#include <asm/oplib.h>
+#include <asm/hypervisor.h>
+
+#include "pci_impl.h"
+#include "iommu_common.h"
+
+#include "pci_sun4v.h"
+
+#define PGLIST_NENTS	(PAGE_SIZE / sizeof(u64))
+
+struct pci_iommu_batch {
+	struct pci_dev	*pdev;		/* Device mapping is for.	*/
+	unsigned long	prot;		/* IOMMU page protections	*/
+	unsigned long	entry;		/* Index into IOTSB.		*/
+	u64		*pglist;	/* List of physical pages	*/
+	unsigned long	npages;		/* Number of pages in list.	*/
+};
+
+static DEFINE_PER_CPU(struct pci_iommu_batch, pci_iommu_batch);
+
+/* Interrupts must be disabled.  */
+static inline void pci_iommu_batch_start(struct pci_dev *pdev, unsigned long prot, unsigned long entry)
+{
+	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);
+
+	p->pdev		= pdev;
+	p->prot		= prot;
+	p->entry	= entry;
+	p->npages	= 0;
+}
+
+/* Interrupts must be disabled.  */
+static long pci_iommu_batch_flush(struct pci_iommu_batch *p)
+{
+	struct pcidev_cookie *pcp = p->pdev->sysdata;
+	unsigned long devhandle = pcp->pbm->devhandle;
+	unsigned long prot = p->prot;
+	unsigned long entry = p->entry;
+	u64 *pglist = p->pglist;
+	unsigned long npages = p->npages;
+
+	while (npages != 0) {
+		long num;
+
+		num = pci_sun4v_iommu_map(devhandle, HV_PCI_TSBID(0, entry),
+					  npages, prot, __pa(pglist));
+		if (unlikely(num < 0)) {
+			if (printk_ratelimit())
+				printk("pci_iommu_batch_flush: IOMMU map of "
+				       "[%08lx:%08lx:%lx:%lx:%lx] failed with "
+				       "status %ld\n",
+				       devhandle, HV_PCI_TSBID(0, entry),
+				       npages, prot, __pa(pglist), num);
+			return -1;
+		}
+
+		entry += num;
+		npages -= num;
+		pglist += num;
+	}
+
+	p->entry = entry;
+	p->npages = 0;
+
+	return 0;
+}
+
+/* Interrupts must be disabled.  */
+static inline long pci_iommu_batch_add(u64 phys_page)
+{
+	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);
+
+	BUG_ON(p->npages >= PGLIST_NENTS);
+
+	p->pglist[p->npages++] = phys_page;
+	if (p->npages == PGLIST_NENTS)
+		return pci_iommu_batch_flush(p);
+
+	return 0;
+}
+
+/* Interrupts must be disabled.  */
+static inline long pci_iommu_batch_end(void)
+{
+	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);
+
+	BUG_ON(p->npages >= PGLIST_NENTS);
+
+	return pci_iommu_batch_flush(p);
+}
+
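+/* Bitmap allocator for IOTSB entries: scan forward from the
+ * last-allocation hint, wrapping around to the start at most
+ * once before giving up.
+ */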
+static long pci_arena_alloc(struct pci_iommu_arena *arena, unsigned long npages)
+{
+	unsigned long n, i, start, end, limit;
+	int pass;
+
+	limit = arena->limit;
+	start = arena->hint;
+	pass = 0;
+
+again:
+	n = find_next_zero_bit(arena->map, limit, start);
+	end = n + npages;
+	if (unlikely(end >= limit)) {
+		if (likely(pass < 1)) {
+			limit = start;
+			start = 0;
+			pass++;
+			goto again;
+		} else {
+			/* Scanned the whole thing, give up. */
+			return -1;
+		}
+	}
+
+	for (i = n; i < end; i++) {
+		if (test_bit(i, arena->map)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	for (i = n; i < end; i++)
+		__set_bit(i, arena->map);
+
+	arena->hint = end;
+
+	return n;
+}
+
+static void pci_arena_free(struct pci_iommu_arena *arena, unsigned long base, unsigned long npages)
+{
+	unsigned long i;
+
+	for (i = base; i < (base + npages); i++)
+		__clear_bit(i, arena->map);
+}
+
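+/* Allocate a consistent DMA buffer: grab zeroed pages, reserve
+ * IOTSB entries for them, then program the mappings through the
+ * batching machinery above.
+ */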
+static void *pci_4v_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, order, first_page, npages, n;
+	void *ret;
+	long entry;
+
+	size = IO_PAGE_ALIGN(size);
+	order = get_order(size);
+	if (unlikely(order >= MAX_ORDER))
+		return NULL;
+
+	npages = size >> IO_PAGE_SHIFT;
+
+	first_page = __get_free_pages(GFP_ATOMIC, order);
+	if (unlikely(first_page == 0UL))
+		return NULL;
+
+	memset((char *)first_page, 0, PAGE_SIZE << order);
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	entry = pci_arena_alloc(&iommu->arena, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (unlikely(entry < 0L))
+		goto arena_alloc_fail;
+
+	*dma_addrp = (iommu->page_table_map_base +
+		      (entry << IO_PAGE_SHIFT));
+	ret = (void *) first_page;
+	first_page = __pa(first_page);
+
+	local_irq_save(flags);
+
+	pci_iommu_batch_start(pdev,
+			      (HV_PCI_MAP_ATTR_READ |
+			       HV_PCI_MAP_ATTR_WRITE),
+			      entry);
+
+	for (n = 0; n < npages; n++) {
+		long err = pci_iommu_batch_add(first_page + (n * PAGE_SIZE));
+		if (unlikely(err < 0L))
+			goto iommu_map_fail;
+	}
+
+	if (unlikely(pci_iommu_batch_end() < 0L))
+		goto iommu_map_fail;
+
+	local_irq_restore(flags);
+
+	return ret;
+
+iommu_map_fail:
+	/* Interrupts are disabled.  */
+	spin_lock(&iommu->lock);
+	pci_arena_free(&iommu->arena, entry, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+arena_alloc_fail:
+	free_pages(first_page, order);
+	return NULL;
+}
+
+static void pci_4v_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_t dvma)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, order, npages, entry;
+	u32 devhandle;
+
+	npages = IO_PAGE_ALIGN(size) >> IO_PAGE_SHIFT;
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	devhandle = pcp->pbm->devhandle;
+	entry = ((dvma - iommu->page_table_map_base) >> IO_PAGE_SHIFT);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	pci_arena_free(&iommu->arena, entry, npages);
+
+	do {
+		unsigned long num;
+
+		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
+					    npages);
+		entry += num;
+		npages -= num;
+	} while (npages != 0);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	order = get_order(size);
+	if (order < 10)
+		free_pages((unsigned long)cpu, order);
+}
+
+static dma_addr_t pci_4v_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, npages, oaddr;
+	unsigned long i, base_paddr;
+	u32 bus_addr, ret;
+	unsigned long prot;
+	long entry;
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+
+	if (unlikely(direction == PCI_DMA_NONE))
+		goto bad;
+
+	oaddr = (unsigned long)ptr;
+	npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK);
+	npages >>= IO_PAGE_SHIFT;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	entry = pci_arena_alloc(&iommu->arena, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (unlikely(entry < 0L))
+		goto bad;
+
+	bus_addr = (iommu->page_table_map_base +
+		    (entry << IO_PAGE_SHIFT));
+	ret = bus_addr | (oaddr & ~IO_PAGE_MASK);
+	base_paddr = __pa(oaddr & IO_PAGE_MASK);
+	prot = HV_PCI_MAP_ATTR_READ;
+	if (direction != PCI_DMA_TODEVICE)
+		prot |= HV_PCI_MAP_ATTR_WRITE;
+
+	local_irq_save(flags);
+
+	pci_iommu_batch_start(pdev, prot, entry);
+
+	for (i = 0; i < npages; i++, base_paddr += IO_PAGE_SIZE) {
+		long err = pci_iommu_batch_add(base_paddr);
+		if (unlikely(err < 0L))
+			goto iommu_map_fail;
+	}
+	if (unlikely(pci_iommu_batch_end() < 0L))
+		goto iommu_map_fail;
+
+	local_irq_restore(flags);
+
+	return ret;
+
+bad:
+	if (printk_ratelimit())
+		WARN_ON(1);
+	return PCI_DMA_ERROR_CODE;
+
+iommu_map_fail:
+	/* Interrupts are disabled.  */
+	spin_lock(&iommu->lock);
+	pci_arena_free(&iommu->arena, entry, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return PCI_DMA_ERROR_CODE;
+}
+
+static void pci_4v_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, npages;
+	long entry;
+	u32 devhandle;
+
+	if (unlikely(direction == PCI_DMA_NONE)) {
+		if (printk_ratelimit())
+			WARN_ON(1);
+		return;
+	}
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	devhandle = pcp->pbm->devhandle;
+
+	npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK);
+	npages >>= IO_PAGE_SHIFT;
+	bus_addr &= IO_PAGE_MASK;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	entry = (bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT;
+	pci_arena_free(&iommu->arena, entry, npages);
+
+	do {
+		unsigned long num;
+
+		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
+					    npages);
+		entry += num;
+		npages -= num;
+	} while (npages != 0);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+#define SG_ENT_PHYS_ADDRESS(SG)	\
+	(__pa(page_address((SG)->page)) + (SG)->offset)
+
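+/* Program the IOMMU entries for a scatterlist whose DMA segments
+ * were laid out by prepare_sg(), walking the backing pages and
+ * coalescing neighbors that share an IO page.
+ */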
+static inline long fill_sg(long entry, struct pci_dev *pdev,
+			   struct scatterlist *sg,
+			   int nused, int nelems, unsigned long prot)
+{
+	struct scatterlist *dma_sg = sg;
+	struct scatterlist *sg_end = sg + nelems;
+	unsigned long flags;
+	int i;
+
+	local_irq_save(flags);
+
+	pci_iommu_batch_start(pdev, prot, entry);
+
+	for (i = 0; i < nused; i++) {
+		unsigned long pteval = ~0UL;
+		u32 dma_npages;
+
+		dma_npages = ((dma_sg->dma_address & (IO_PAGE_SIZE - 1UL)) +
+			      dma_sg->dma_length +
+			      ((IO_PAGE_SIZE - 1UL))) >> IO_PAGE_SHIFT;
+		do {
+			unsigned long offset;
+			signed int len;
+
+			/* If we are here, we know we have at least one
+			 * more page to map.  So walk forward until we
+			 * hit a page crossing, and begin creating new
+			 * mappings from that spot.
+			 */
+			for (;;) {
+				unsigned long tmp;
+
+				tmp = SG_ENT_PHYS_ADDRESS(sg);
+				len = sg->length;
+				if (((tmp ^ pteval) >> IO_PAGE_SHIFT) != 0UL) {
+					pteval = tmp & IO_PAGE_MASK;
+					offset = tmp & (IO_PAGE_SIZE - 1UL);
+					break;
+				}
+				if (((tmp ^ (tmp + len - 1UL)) >> IO_PAGE_SHIFT) != 0UL) {
+					pteval = (tmp + IO_PAGE_SIZE) & IO_PAGE_MASK;
+					offset = 0UL;
+					len -= (IO_PAGE_SIZE - (tmp & (IO_PAGE_SIZE - 1UL)));
+					break;
+				}
+				sg++;
+			}
+
+			pteval = (pteval & IOPTE_PAGE);
+			while (len > 0) {
+				long err;
+
+				err = pci_iommu_batch_add(pteval);
+				if (unlikely(err < 0L))
+					goto iommu_map_failed;
+
+				pteval += IO_PAGE_SIZE;
+				len -= (IO_PAGE_SIZE - offset);
+				offset = 0;
+				dma_npages--;
+			}
+
+			pteval = (pteval & IOPTE_PAGE) + len;
+			sg++;
+
+			/* Skip over any tail mappings we've fully mapped,
+			 * adjusting pteval along the way.  Stop when we
+			 * detect a page crossing event.
+			 */
+			while (sg < sg_end &&
+			       (pteval << (64 - IO_PAGE_SHIFT)) != 0UL &&
+			       (pteval == SG_ENT_PHYS_ADDRESS(sg)) &&
+			       ((pteval ^
+				 (SG_ENT_PHYS_ADDRESS(sg) + sg->length - 1UL)) >> IO_PAGE_SHIFT) == 0UL) {
+				pteval += sg->length;
+				sg++;
+			}
+			if ((pteval << (64 - IO_PAGE_SHIFT)) == 0UL)
+				pteval = ~0UL;
+		} while (dma_npages != 0);
+		dma_sg++;
+	}
+
+	if (unlikely(pci_iommu_batch_end() < 0L))
+		goto iommu_map_failed;
+
+	local_irq_restore(flags);
+	return 0;
+
+iommu_map_failed:
+	local_irq_restore(flags);
+	return -1L;
+}
+
+static int pci_4v_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, npages, prot;
+	u32 dma_base;
+	struct scatterlist *sgtmp;
+	long entry, err;
+	int used;
+
+	/* Fast path single entry scatterlists. */
+	if (nelems == 1) {
+		sglist->dma_address =
+			pci_4v_map_single(pdev,
+					  (page_address(sglist->page) + sglist->offset),
+					  sglist->length, direction);
+		if (unlikely(sglist->dma_address == PCI_DMA_ERROR_CODE))
+			return 0;
+		sglist->dma_length = sglist->length;
+		return 1;
+	}
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	
+	if (unlikely(direction == PCI_DMA_NONE))
+		goto bad;
+
+	/* Step 1: Prepare scatter list. */
+	npages = prepare_sg(sglist, nelems);
+
+	/* Step 2: Allocate a cluster and context, if necessary. */
+	spin_lock_irqsave(&iommu->lock, flags);
+	entry = pci_arena_alloc(&iommu->arena, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (unlikely(entry < 0L))
+		goto bad;
+
+	dma_base = iommu->page_table_map_base +
+		(entry << IO_PAGE_SHIFT);
+
+	/* Step 3: Normalize DMA addresses. */
+	used = nelems;
+
+	sgtmp = sglist;
+	while (used && sgtmp->dma_length) {
+		sgtmp->dma_address += dma_base;
+		sgtmp++;
+		used--;
+	}
+	used = nelems - used;
+
+	/* Step 4: Create the mappings. */
+	prot = HV_PCI_MAP_ATTR_READ;
+	if (direction != PCI_DMA_TODEVICE)
+		prot |= HV_PCI_MAP_ATTR_WRITE;
+
+	err = fill_sg(entry, pdev, sglist, used, nelems, prot);
+	if (unlikely(err < 0L))
+		goto iommu_map_failed;
+
+	return used;
+
+bad:
+	if (printk_ratelimit())
+		WARN_ON(1);
+	return 0;
+
+iommu_map_failed:
+	spin_lock_irqsave(&iommu->lock, flags);
+	pci_arena_free(&iommu->arena, entry, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return 0;
+}
+
+static void pci_4v_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, i, npages;
+	long entry;
+	u32 devhandle, bus_addr;
+
+	if (unlikely(direction == PCI_DMA_NONE)) {
+		if (printk_ratelimit())
+			WARN_ON(1);
+	}
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	devhandle = pcp->pbm->devhandle;
+	
+	bus_addr = sglist->dma_address & IO_PAGE_MASK;
+
+	for (i = 1; i < nelems; i++)
+		if (sglist[i].dma_length == 0)
+			break;
+	i--;
+	npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) -
+		  bus_addr) >> IO_PAGE_SHIFT;
+
+	entry = ((bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	pci_arena_free(&iommu->arena, entry, npages);
+
+	do {
+		unsigned long num;
+
+		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
+					    npages);
+		entry += num;
+		npages -= num;
+	} while (npages != 0);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void pci_4v_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+{
+	/* Nothing to do... */
+}
+
+static void pci_4v_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+{
+	/* Nothing to do... */
+}
+
+struct pci_iommu_ops pci_sun4v_iommu_ops = {
+	.alloc_consistent		= pci_4v_alloc_consistent,
+	.free_consistent		= pci_4v_free_consistent,
+	.map_single			= pci_4v_map_single,
+	.unmap_single			= pci_4v_unmap_single,
+	.map_sg				= pci_4v_map_sg,
+	.unmap_sg			= pci_4v_unmap_sg,
+	.dma_sync_single_for_cpu	= pci_4v_dma_sync_single_for_cpu,
+	.dma_sync_sg_for_cpu		= pci_4v_dma_sync_sg_for_cpu,
+};
+
+/* SUN4V PCI configuration space accessors. */
+
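+/* Config space accesses are only valid within this PBM's bus
+ * range; on the PBM's first bus only device 0, function 0
+ * responds.
+ */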
+static inline int pci_sun4v_out_of_range(struct pci_pbm_info *pbm, unsigned int bus, unsigned int device, unsigned int func)
+{
+	if (bus == pbm->pci_first_busno) {
+		if (device == 0 && func == 0)
+			return 0;
+		return 1;
+	}
+
+	if (bus < pbm->pci_first_busno ||
+	    bus > pbm->pci_last_busno)
+		return 1;
+	return 0;
+}
+
+static int pci_sun4v_read_pci_cfg(struct pci_bus *bus_dev, unsigned int devfn,
+				  int where, int size, u32 *value)
+{
+	struct pci_pbm_info *pbm = bus_dev->sysdata;
+	u32 devhandle = pbm->devhandle;
+	unsigned int bus = bus_dev->number;
+	unsigned int device = PCI_SLOT(devfn);
+	unsigned int func = PCI_FUNC(devfn);
+	unsigned long ret;
+
+	if (pci_sun4v_out_of_range(pbm, bus, device, func)) {
+		ret = ~0UL;
+	} else {
+		ret = pci_sun4v_config_get(devhandle,
+				HV_PCI_DEVICE_BUILD(bus, device, func),
+				where, size);
+#if 0
+		printk("rcfg: [%x:%x:%x:%d]=[%lx]\n",
+		       devhandle, HV_PCI_DEVICE_BUILD(bus, device, func),
+		       where, size, ret);
+#endif
+	}
+	switch (size) {
+	case 1:
+		*value = ret & 0xff;
+		break;
+	case 2:
+		*value = ret & 0xffff;
+		break;
+	case 4:
+		*value = ret & 0xffffffff;
+		break;
+	};
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int pci_sun4v_write_pci_cfg(struct pci_bus *bus_dev, unsigned int devfn,
+				   int where, int size, u32 value)
+{
+	struct pci_pbm_info *pbm = bus_dev->sysdata;
+	u32 devhandle = pbm->devhandle;
+	unsigned int bus = bus_dev->number;
+	unsigned int device = PCI_SLOT(devfn);
+	unsigned int func = PCI_FUNC(devfn);
+	unsigned long ret;
+
+	if (pci_sun4v_out_of_range(pbm, bus, device, func)) {
+		/* Do nothing. */
+	} else {
+		ret = pci_sun4v_config_put(devhandle,
+				HV_PCI_DEVICE_BUILD(bus, device, func),
+				where, size, value);
+#if 0
+		printk("wcfg: [%x:%x:%x:%d] v[%x] == [%lx]\n",
+		       devhandle, HV_PCI_DEVICE_BUILD(bus, device, func),
+		       where, size, value, ret);
+#endif
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops pci_sun4v_ops = {
+	.read =		pci_sun4v_read_pci_cfg,
+	.write =	pci_sun4v_write_pci_cfg,
+};
+
+
+static void pbm_scan_bus(struct pci_controller_info *p,
+			 struct pci_pbm_info *pbm)
+{
+	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+
+	if (!cookie) {
+		prom_printf("%s: Critical allocation failure.\n", pbm->name);
+		prom_halt();
+	}
+
+	/* All we care about is the PBM. */
+	memset(cookie, 0, sizeof(*cookie));
+	cookie->pbm = pbm;
+
+	pbm->pci_bus = pci_scan_bus(pbm->pci_first_busno, p->pci_ops, pbm);
+#if 0
+	pci_fixup_host_bridge_self(pbm->pci_bus);
+	pbm->pci_bus->self->sysdata = cookie;
+#endif
+	pci_fill_in_pbm_cookies(pbm->pci_bus, pbm,
+				pbm->prom_node);
+	pci_record_assignments(pbm, pbm->pci_bus);
+	pci_assign_unassigned(pbm, pbm->pci_bus);
+	pci_fixup_irq(pbm, pbm->pci_bus);
+	pci_determine_66mhz_disposition(pbm, pbm->pci_bus);
+	pci_setup_busmastering(pbm, pbm->pci_bus);
+}
+
+static void pci_sun4v_scan_bus(struct pci_controller_info *p)
+{
+	if (p->pbm_A.prom_node) {
+		p->pbm_A.is_66mhz_capable =
+			prom_getbool(p->pbm_A.prom_node, "66mhz-capable");
+
+		pbm_scan_bus(p, &p->pbm_A);
+	}
+	if (p->pbm_B.prom_node) {
+		p->pbm_B.is_66mhz_capable =
+			prom_getbool(p->pbm_B.prom_node, "66mhz-capable");
+
+		pbm_scan_bus(p, &p->pbm_B);
+	}
+
+	/* XXX register error interrupt handlers XXX */
+}
+
+static unsigned int pci_sun4v_irq_build(struct pci_pbm_info *pbm,
+					struct pci_dev *pdev,
+					unsigned int devino)
+{
+	u32 devhandle = pbm->devhandle;
+	int pil;
+
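+	/* Choose an interrupt level based on the device's PCI
+	 * base class.
+	 */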
+	pil = 5;
+	if (pdev) {
+		switch ((pdev->class >> 16) & 0xff) {
+		case PCI_BASE_CLASS_STORAGE:
+			pil = 5;
+			break;
+
+		case PCI_BASE_CLASS_NETWORK:
+			pil = 6;
+			break;
+
+		case PCI_BASE_CLASS_DISPLAY:
+			pil = 9;
+			break;
+
+		case PCI_BASE_CLASS_MULTIMEDIA:
+		case PCI_BASE_CLASS_MEMORY:
+		case PCI_BASE_CLASS_BRIDGE:
+		case PCI_BASE_CLASS_SERIAL:
+			pil = 10;
+			break;
+
+		default:
+			pil = 5;
+			break;
+		};
+	}
+	BUG_ON(PIL_RESERVED(pil));
+
+	return sun4v_build_irq(devhandle, devino, pil, IBF_PCI);
+}
+
+static void pci_sun4v_base_address_update(struct pci_dev *pdev, int resource)
+{
+	struct pcidev_cookie *pcp = pdev->sysdata;
+	struct pci_pbm_info *pbm = pcp->pbm;
+	struct resource *res, *root;
+	u32 reg;
+	int where, size, is_64bit;
+
+	res = &pdev->resource[resource];
+	if (resource < 6) {
+		where = PCI_BASE_ADDRESS_0 + (resource * 4);
+	} else if (resource == PCI_ROM_RESOURCE) {
+		where = pdev->rom_base_reg;
+	} else {
+		/* Somebody might have asked allocation of a non-standard resource */
+		return;
+	}
+
+	/* XXX 64-bit MEM handling is not 100% correct... XXX */
+	is_64bit = 0;
+	if (res->flags & IORESOURCE_IO)
+		root = &pbm->io_space;
+	else {
+		root = &pbm->mem_space;
+		if ((res->flags & PCI_BASE_ADDRESS_MEM_TYPE_MASK)
+		    == PCI_BASE_ADDRESS_MEM_TYPE_64)
+			is_64bit = 1;
+	}
+
+	size = res->end - res->start;
+	pci_read_config_dword(pdev, where, &reg);
+	reg = ((reg & size) |
+	       (((u32)(res->start - root->start)) & ~size));
+	if (resource == PCI_ROM_RESOURCE) {
+		reg |= PCI_ROM_ADDRESS_ENABLE;
+		res->flags |= IORESOURCE_ROM_ENABLE;
+	}
+	pci_write_config_dword(pdev, where, reg);
+
+	/* This knows that the upper 32-bits of the address
+	 * must be zero.  Our PCI common layer enforces this.
+	 */
+	if (is_64bit)
+		pci_write_config_dword(pdev, where + 4, 0);
+}
+
+static void pci_sun4v_resource_adjust(struct pci_dev *pdev,
+				      struct resource *res,
+				      struct resource *root)
+{
+	res->start += root->start;
+	res->end += root->start;
+}
+
+/* Use ranges property to determine where PCI MEM, I/O, and Config
+ * space are for this PCI bus module.
+ */
+static void pci_sun4v_determine_mem_io_space(struct pci_pbm_info *pbm)
+{
+	int i, saw_mem, saw_io;
+
+	saw_mem = saw_io = 0;
+	for (i = 0; i < pbm->num_pbm_ranges; i++) {
+		struct linux_prom_pci_ranges *pr = &pbm->pbm_ranges[i];
+		unsigned long a;
+		int type;
+
+		type = (pr->child_phys_hi >> 24) & 0x3;
+		a = (((unsigned long)pr->parent_phys_hi << 32UL) |
+		     ((unsigned long)pr->parent_phys_lo  <<  0UL));
+
+		switch (type) {
+		case 1:
+			/* 16-bit IO space, 16MB */
+			pbm->io_space.start = a;
+			pbm->io_space.end = a + ((16UL*1024UL*1024UL) - 1UL);
+			pbm->io_space.flags = IORESOURCE_IO;
+			saw_io = 1;
+			break;
+
+		case 2:
+			/* 32-bit MEM space, 2GB */
+			pbm->mem_space.start = a;
+			pbm->mem_space.end = a + (0x80000000UL - 1UL);
+			pbm->mem_space.flags = IORESOURCE_MEM;
+			saw_mem = 1;
+			break;
+
+		case 3:
+			/* XXX 64-bit MEM handling XXX */
+
+		default:
+			break;
+		};
+	}
+
+	if (!saw_io || !saw_mem) {
+		prom_printf("%s: Fatal error, missing %s PBM range.\n",
+			    pbm->name,
+			    (!saw_io ? "IO" : "MEM"));
+		prom_halt();
+	}
+
+	printk("%s: PCI IO[%lx] MEM[%lx]\n",
+	       pbm->name,
+	       pbm->io_space.start,
+	       pbm->mem_space.start);
+}
+
+static void pbm_register_toplevel_resources(struct pci_controller_info *p,
+					    struct pci_pbm_info *pbm)
+{
+	pbm->io_space.name = pbm->mem_space.name = pbm->name;
+
+	request_resource(&ioport_resource, &pbm->io_space);
+	request_resource(&iomem_resource, &pbm->mem_space);
+	pci_register_legacy_regions(&pbm->io_space,
+				    &pbm->mem_space);
+}
+
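+/* Ask the hypervisor which IOTSB entries already hold valid
+ * mappings (e.g. ones established by the firmware) and mark
+ * them in-use so the arena never hands them out.
+ */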
+static unsigned long probe_existing_entries(struct pci_pbm_info *pbm,
+					    struct pci_iommu *iommu)
+{
+	struct pci_iommu_arena *arena = &iommu->arena;
+	unsigned long i, cnt = 0;
+	u32 devhandle;
+
+	devhandle = pbm->devhandle;
+	for (i = 0; i < arena->limit; i++) {
+		unsigned long ret, io_attrs, ra;
+
+		ret = pci_sun4v_iommu_getmap(devhandle,
+					     HV_PCI_TSBID(0, i),
+					     &io_attrs, &ra);
+		if (ret == HV_EOK) {
+			cnt++;
+			__set_bit(i, arena->map);
+		}
+	}
+
+	return cnt;
+}
+
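+/* Size the IOTSB and the DMA address mask from the "virtual-dma"
+ * property, which gives the DVMA base and size for this PBM.
+ */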
+static void pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
+{
+	struct pci_iommu *iommu = pbm->iommu;
+	unsigned long num_tsb_entries, sz;
+	u32 vdma[2], dma_mask, dma_offset;
+	int err, tsbsize;
+
+	err = prom_getproperty(pbm->prom_node, "virtual-dma",
+			       (char *)&vdma[0], sizeof(vdma));
+	if (err == 0 || err == -1) {
+		/* No property, use default values. */
+		vdma[0] = 0x80000000;
+		vdma[1] = 0x80000000;
+	}
+
+	dma_mask = vdma[0];
+	switch (vdma[1]) {
+		case 0x20000000:
+			dma_mask |= 0x1fffffff;
+			tsbsize = 64;
+			break;
+
+		case 0x40000000:
+			dma_mask |= 0x3fffffff;
+			tsbsize = 128;
+			break;
+
+		case 0x80000000:
+			dma_mask |= 0x7fffffff;
+			tsbsize = 256;
+			break;
+
+		default:
+			prom_printf("PCI-SUN4V: strange virtual-dma size.\n");
+			prom_halt();
+	};
+
+	tsbsize *= (8 * 1024);
+
+	num_tsb_entries = tsbsize / sizeof(iopte_t);
+
+	dma_offset = vdma[0];
+
+	/* Setup initial software IOMMU state. */
+	spin_lock_init(&iommu->lock);
+	iommu->ctx_lowest_free = 1;
+	iommu->page_table_map_base = dma_offset;
+	iommu->dma_addr_mask = dma_mask;
+
+	/* Allocate and initialize the free area map.  */
+	sz = num_tsb_entries / 8;
+	sz = (sz + 7UL) & ~7UL;
+	iommu->arena.map = kmalloc(sz, GFP_KERNEL);
+	if (!iommu->arena.map) {
+		prom_printf("PCI_IOMMU: Error, kmalloc(arena.map) failed.\n");
+		prom_halt();
+	}
+	memset(iommu->arena.map, 0, sz);
+	iommu->arena.limit = num_tsb_entries;
+
+	sz = probe_existing_entries(pbm, iommu);
+
+	printk("%s: TSB entries [%lu], existing mappings [%lu]\n",
+	       pbm->name, num_tsb_entries, sz);
+}
+
+static void pci_sun4v_get_bus_range(struct pci_pbm_info *pbm)
+{
+	unsigned int busrange[2];
+	int prom_node = pbm->prom_node;
+	int err;
+
+	err = prom_getproperty(prom_node, "bus-range",
+			       (char *)&busrange[0],
+			       sizeof(busrange));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no bus-range.\n", pbm->name);
+		prom_halt();
+	}
+
+	pbm->pci_first_busno = busrange[0];
+	pbm->pci_last_busno = busrange[1];
+
+}
+
+static void pci_sun4v_pbm_init(struct pci_controller_info *p, int prom_node, u32 devhandle)
+{
+	struct pci_pbm_info *pbm;
+	int err, i;
+
+	if (devhandle & 0x40)
+		pbm = &p->pbm_B;
+	else
+		pbm = &p->pbm_A;
+
+	pbm->parent = p;
+	pbm->prom_node = prom_node;
+	pbm->pci_first_slot = 1;
+
+	pbm->devhandle = devhandle;
+
+	sprintf(pbm->name, "SUN4V-PCI%d PBM%c",
+		p->index, (pbm == &p->pbm_A ? 'A' : 'B'));
+
+	printk("%s: devhandle[%x] prom_node[%x:%x]\n",
+	       pbm->name, pbm->devhandle,
+	       pbm->prom_node, prom_getchild(pbm->prom_node));
+
+	prom_getstring(prom_node, "name",
+		       pbm->prom_name, sizeof(pbm->prom_name));
+
+	err = prom_getproperty(prom_node, "ranges",
+			       (char *) pbm->pbm_ranges,
+			       sizeof(pbm->pbm_ranges));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no ranges property.\n",
+			    pbm->name);
+		prom_halt();
+	}
+
+	pbm->num_pbm_ranges =
+		(err / sizeof(struct linux_prom_pci_ranges));
+
+	/* Mask out the top 8 bits of the ranges, leaving the real
+	 * physical address.
+	 */
+	for (i = 0; i < pbm->num_pbm_ranges; i++)
+		pbm->pbm_ranges[i].parent_phys_hi &= 0x0fffffff;
+
+	pci_sun4v_determine_mem_io_space(pbm);
+	pbm_register_toplevel_resources(p, pbm);
+
+	err = prom_getproperty(prom_node, "interrupt-map",
+			       (char *)pbm->pbm_intmap,
+			       sizeof(pbm->pbm_intmap));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no interrupt-map property.\n",
+			    pbm->name);
+		prom_halt();
+	}
+
+	pbm->num_pbm_intmap = (err / sizeof(struct linux_prom_pci_intmap));
+	err = prom_getproperty(prom_node, "interrupt-map-mask",
+			       (char *)&pbm->pbm_intmask,
+			       sizeof(pbm->pbm_intmask));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no interrupt-map-mask.\n",
+			    pbm->name);
+		prom_halt();
+	}
+
+	pci_sun4v_get_bus_range(pbm);
+	pci_sun4v_iommu_init(pbm);
+}
+
+void sun4v_pci_init(int node, char *model_name)
+{
+	struct pci_controller_info *p;
+	struct pci_iommu *iommu;
+	struct linux_prom64_registers regs;
+	u32 devhandle;
+	int i;
+
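+	/* The devhandle lives in the upper 32 bits of the first
+	 * "reg" property entry.
+	 */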
+	prom_getproperty(node, "reg", (char *)&regs, sizeof(regs));
+	devhandle = (regs.phys_addr >> 32UL) & 0x0fffffff;
+
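+	/* The two PBMs of a controller differ only in bit 6 of the
+	 * devhandle; attach to an existing controller whose other
+	 * PBM is already initialized, if there is one.
+	 */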
+	for (p = pci_controller_root; p; p = p->next) {
+		struct pci_pbm_info *pbm;
+
+		if (p->pbm_A.prom_node && p->pbm_B.prom_node)
+			continue;
+
+		pbm = (p->pbm_A.prom_node ?
+		       &p->pbm_A :
+		       &p->pbm_B);
+
+		if (pbm->devhandle == (devhandle ^ 0x40)) {
+			pci_sun4v_pbm_init(p, node, devhandle);
+			return;
+		}
+	}
+
+	for_each_cpu(i) {
+		unsigned long page = get_zeroed_page(GFP_ATOMIC);
+
+		if (!page)
+			goto fatal_memory_error;
+
+		per_cpu(pci_iommu_batch, i).pglist = (u64 *) page;
+	}
+
+	p = kmalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
+	if (!p)
+		goto fatal_memory_error;
+
+	memset(p, 0, sizeof(*p));
+
+	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	if (!iommu)
+		goto fatal_memory_error;
+
+	memset(iommu, 0, sizeof(*iommu));
+	p->pbm_A.iommu = iommu;
+
+	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	if (!iommu)
+		goto fatal_memory_error;
+
+	memset(iommu, 0, sizeof(*iommu));
+	p->pbm_B.iommu = iommu;
+
+	p->next = pci_controller_root;
+	pci_controller_root = p;
+
+	p->index = pci_num_controllers++;
+	p->pbms_same_domain = 0;
+
+	p->scan_bus = pci_sun4v_scan_bus;
+	p->irq_build = pci_sun4v_irq_build;
+	p->base_address_update = pci_sun4v_base_address_update;
+	p->resource_adjust = pci_sun4v_resource_adjust;
+	p->pci_ops = &pci_sun4v_ops;
+
+	/* Like PSYCHO and SCHIZO we have a 2GB aligned area
+	 * for memory space.
+	 */
+	pci_memspace_mask = 0x7fffffffUL;
+
+	pci_sun4v_pbm_init(p, node, devhandle);
+	return;
+
+fatal_memory_error:
+	prom_printf("SUN4V_PCI: Fatal memory allocation error.\n");
+	prom_halt();
+}

+ 31 - 0
arch/sparc64/kernel/pci_sun4v.h

@@ -0,0 +1,31 @@
+/* pci_sun4v.h: SUN4V specific PCI controller support.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#ifndef _PCI_SUN4V_H
+#define _PCI_SUN4V_H
+
+extern long pci_sun4v_iommu_map(unsigned long devhandle,
+				unsigned long tsbid,
+				unsigned long num_ttes,
+				unsigned long io_attributes,
+				unsigned long io_page_list_pa);
+extern unsigned long pci_sun4v_iommu_demap(unsigned long devhandle,
+					   unsigned long tsbid,
+					   unsigned long num_ttes);
+extern unsigned long pci_sun4v_iommu_getmap(unsigned long devhandle,
+					    unsigned long tsbid,
+					    unsigned long *io_attributes,
+					    unsigned long *real_address);
+extern unsigned long pci_sun4v_config_get(unsigned long devhandle,
+					  unsigned long pci_device,
+					  unsigned long config_offset,
+					  unsigned long size);
+extern int pci_sun4v_config_put(unsigned long devhandle,
+				unsigned long pci_device,
+				unsigned long config_offset,
+				unsigned long size,
+				unsigned long data);
+
+#endif /* !(_PCI_SUN4V_H) */

+ 95 - 0
arch/sparc64/kernel/pci_sun4v_asm.S

@@ -0,0 +1,95 @@
+/* pci_sun4v_asm: Hypervisor calls for PCI support.
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <asm/hypervisor.h>
+
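+	/* Each stub below loads a fast trap function number into
+	 * %o5 and traps into the hypervisor; arguments and return
+	 * values travel in %o0-%o4.
+	 */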
+	/* %o0: devhandle
+	 * %o1:	tsbid
+	 * %o2:	num ttes
+	 * %o3:	io_attributes
+	 * %o4:	io_page_list phys address
+	 *
+	 * returns %o0:	-status if status was non-zero, else
+	 *         %o0:	num pages mapped
+	 */
+	.globl	pci_sun4v_iommu_map
+pci_sun4v_iommu_map:
+	mov	%o5, %g1
+	mov	HV_FAST_PCI_IOMMU_MAP, %o5
+	ta	HV_FAST_TRAP
+	brnz,pn %o0, 1f
+	 sub	%g0, %o0, %o0
+	mov	%o1, %o0
+1:	retl
+	 nop
+
+	/* %o0: devhandle
+	 * %o1:	tsbid
+	 * %o2:	num ttes
+	 *
+	 * returns %o0:	num ttes demapped
+	 */
+	.globl	pci_sun4v_iommu_demap
+pci_sun4v_iommu_demap:
+	mov	HV_FAST_PCI_IOMMU_DEMAP, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: devhandle
+	 * %o1:	tsbid
+	 * %o2:	&io_attributes
+	 * %o3:	&real_address
+	 *
+	 * returns %o0:	status
+	 */
+	.globl	pci_sun4v_iommu_getmap
+pci_sun4v_iommu_getmap:
+	mov	%o2, %o4
+	mov	HV_FAST_PCI_IOMMU_GETMAP, %o5
+	ta	HV_FAST_TRAP
+	stx	%o1, [%o4]
+	stx	%o2, [%o3]
+	retl
+	 mov	%o0, %o0
+
+	/* %o0: devhandle
+	 * %o1:	pci_device
+	 * %o2:	pci_config_offset
+	 * %o3:	size
+	 *
+	 * returns %o0:	data
+	 *
+	 * If there is an error, the data will be returned
+	 * as all 1's.
+	 */
+	.globl	pci_sun4v_config_get
+pci_sun4v_config_get:
+	mov	HV_FAST_PCI_CONFIG_GET, %o5
+	ta	HV_FAST_TRAP
+	brnz,a,pn %o1, 1f
+	 mov	-1, %o2
+1:	retl
+	 mov	%o2, %o0
+
+	/* %o0: devhandle
+	 * %o1:	pci_device
+	 * %o2:	pci_config_offset
+	 * %o3:	size
+	 * %o4:	data
+	 *
+	 * returns %o0:	status
+	 *
+	 * status will be zero if the operation completed
+	 * successfully, else -1 if not
+	 */
+	.globl	pci_sun4v_config_put
+pci_sun4v_config_put:
+	mov	HV_FAST_PCI_CONFIG_PUT, %o5
+	ta	HV_FAST_TRAP
+	brnz,a,pn %o1, 1f
+	 mov	-1, %o1
+1:	retl
+	 mov	%o1, %o0

+ 57 - 76
arch/sparc64/kernel/process.c

@@ -44,83 +44,61 @@
 #include <asm/fpumacro.h>
 #include <asm/head.h>
 #include <asm/cpudata.h>
+#include <asm/mmu_context.h>
 #include <asm/unistd.h>
+#include <asm/hypervisor.h>
 
 /* #define VERBOSE_SHOWREGS */
 
-/*
- * Nothing special yet...
- */
-void default_idle(void)
-{
-}
-
-#ifndef CONFIG_SMP
-
-/*
- * the idle loop on a Sparc... ;)
- */
-void cpu_idle(void)
+static void sparc64_yield(void)
 {
-	/* endless idle loop with no priority at all */
-	for (;;) {
-		/* If current->work.need_resched is zero we should really
-		 * setup for a system wakup event and execute a shutdown
-		 * instruction.
-		 *
-		 * But this requires writing back the contents of the
-		 * L2 cache etc. so implement this later. -DaveM
-		 */
-		while (!need_resched())
-			barrier();
+	if (tlb_type != hypervisor)
+		return;
 
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
-		check_pgt_cache();
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+	smp_mb__after_clear_bit();
+
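+	/* Yield the virtual cpu to the hypervisor until there is
+	 * work to do; rechecking need_resched() with interrupts
+	 * disabled closes the race with a concurrent wakeup.
+	 */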
+	while (!need_resched()) {
+		unsigned long pstate;
+
+		/* Disable interrupts. */
+		__asm__ __volatile__(
+			"rdpr %%pstate, %0\n\t"
+			"andn %0, %1, %0\n\t"
+			"wrpr %0, %%g0, %%pstate"
+			: "=&r" (pstate)
+			: "i" (PSTATE_IE));
+
+		if (!need_resched())
+			sun4v_cpu_yield();
+
+		/* Re-enable interrupts. */
+		__asm__ __volatile__(
+			"rdpr %%pstate, %0\n\t"
+			"or %0, %1, %0\n\t"
+			"wrpr %0, %%g0, %%pstate"
+			: "=&r" (pstate)
+			: "i" (PSTATE_IE));
 	}
-}
 
-#else
+	set_thread_flag(TIF_POLLING_NRFLAG);
+}
 
-/*
- * the idle loop on a UltraMultiPenguin...
- *
- * TIF_POLLING_NRFLAG is set because we do not sleep the cpu
- * inside of the idler task, so an interrupt is not needed
- * to get a clean fast response.
- *
- * XXX Reverify this assumption... -DaveM
- *
- * Addendum: We do want it to do something for the signal
- *           delivery case, we detect that by just seeing
- *           if we are trying to send this to an idler or not.
- */
+/* The idle loop on sparc64. */
 void cpu_idle(void)
 {
-	cpuinfo_sparc *cpuinfo = &local_cpu_data();
 	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	while(1) {
 		if (need_resched()) {
-			cpuinfo->idle_volume = 0;
 			preempt_enable_no_resched();
 			schedule();
 			preempt_disable();
-			check_pgt_cache();
 		}
-		cpuinfo->idle_volume++;
-
-		/* The store ordering is so that IRQ handlers on
-		 * other cpus see our increasing idleness for the buddy
-		 * redistribution algorithm.  -DaveM
-		 */
-		membar_storeload_storestore();
+		sparc64_yield();
 	}
 }
 
-#endif
-
 extern char reboot_command [];
 
 extern void (*prom_palette)(int);
@@ -354,6 +332,7 @@ void show_regs(struct pt_regs *regs)
 	extern long etrap, etraptl1;
 #endif
 	__show_regs(regs);
+#if 0
 #ifdef CONFIG_SMP
 	{
 		extern void smp_report_regs(void);
@@ -361,6 +340,7 @@ void show_regs(struct pt_regs *regs)
 		smp_report_regs();
 	}
 #endif
+#endif
 
 #ifdef VERBOSE_SHOWREGS	
 	if (regs->tpc >= &etrap && regs->tpc < &etraptl1 &&
@@ -433,30 +413,15 @@ void exit_thread(void)
 void flush_thread(void)
 {
 	struct thread_info *t = current_thread_info();
+	struct mm_struct *mm;
 
 	if (t->flags & _TIF_ABI_PENDING)
 		t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT);
 
-	if (t->task->mm) {
-		unsigned long pgd_cache = 0UL;
-		if (test_thread_flag(TIF_32BIT)) {
-			struct mm_struct *mm = t->task->mm;
-			pgd_t *pgd0 = &mm->pgd[0];
-			pud_t *pud0 = pud_offset(pgd0, 0);
+	mm = t->task->mm;
+	if (mm)
+		tsb_context_switch(mm);
 
-			if (pud_none(*pud0)) {
-				pmd_t *page = pmd_alloc_one(mm, 0);
-				pud_set(pud0, page);
-			}
-			pgd_cache = get_pgd_cache(pgd0);
-		}
-		__asm__ __volatile__("stxa %0, [%1] %2\n\t"
-				     "membar #Sync"
-				     : /* no outputs */
-				     : "r" (pgd_cache),
-				     "r" (TSB_REG),
-				     "i" (ASI_DMMU));
-	}
 	set_thread_wsaved(0);
 
 	/* Turn off performance counters if on. */
@@ -555,6 +520,18 @@ void synchronize_user_stack(void)
 	}
 }
 
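+/* Deliver SIGBUS/BUS_ADRALN for a misaligned user stack pointer. */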
+static void stack_unaligned(unsigned long sp)
+{
+	siginfo_t info;
+
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = BUS_ADRALN;
+	info.si_addr = (void __user *) sp;
+	info.si_trapno = 0;
+	force_sig_info(SIGBUS, &info, current);
+}
+
 void fault_in_user_windows(void)
 {
 	struct thread_info *t = current_thread_info();
@@ -570,13 +547,17 @@ void fault_in_user_windows(void)
 	flush_user_windows();
 	window = get_thread_wsaved();
 
-	if (window != 0) {
+	if (likely(window != 0)) {
 		window -= 1;
 		do {
 			unsigned long sp = (t->rwbuf_stkptrs[window] + bias);
 			struct reg_window *rwin = &t->reg_window[window];
 
-			if (copy_to_user((char __user *)sp, rwin, winsize))
+			if (unlikely(sp & 0x7UL))
+				stack_unaligned(sp);
+
+			if (unlikely(copy_to_user((char __user *)sp,
+						  rwin, winsize)))
 				goto barf;
 		} while (window--);
 	}

+ 3 - 0
arch/sparc64/kernel/ptrace.c

@@ -124,6 +124,9 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 {
 	BUG_ON(len > PAGE_SIZE);
 
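+	/* sun4v cpus are not subject to the D-cache aliasing
+	 * hazard this flush works around, so nothing to do.
+	 */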
+	if (tlb_type == hypervisor)
+		return;
+
 #ifdef DCACHE_ALIASING_POSSIBLE
 	/* If bit 13 of the kernel address we used to access the
 	 * user page is the same as the virtual address that page

+ 105 - 10
arch/sparc64/kernel/rtrap.S

@@ -223,12 +223,26 @@ rt_continue:	ldx			[%sp + PTREGS_OFF + PT_V9_G1], %g1
 		ldx			[%sp + PTREGS_OFF + PT_V9_G3], %g3
 		ldx			[%sp + PTREGS_OFF + PT_V9_G4], %g4
 		ldx			[%sp + PTREGS_OFF + PT_V9_G5], %g5
-		mov			TSB_REG, %g6
-		brnz,a,pn		%l3, 1f
-		 ldxa			[%g6] ASI_IMMU, %g5
-1:		ldx			[%sp + PTREGS_OFF + PT_V9_G6], %g6
+		brz,pt			%l3, 1f
+		mov			%g6, %l2
+
+		/* Must do this before thread reg is clobbered below.  */
+		LOAD_PER_CPU_BASE(%g5, %g6, %i0, %i1, %i2)
+1:
+		ldx			[%sp + PTREGS_OFF + PT_V9_G6], %g6
 		ldx			[%sp + PTREGS_OFF + PT_V9_G7], %g7
-		wrpr			%g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
+
+		/* Normal globals are restored, go to trap globals.  */
+661:		wrpr			%g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
+		nop
+		.section		.sun4v_2insn_patch, "ax"
+		.word			661b
+		wrpr			%g0, RTRAP_PSTATE_IRQOFF, %pstate
+		SET_GL(1)
+		.previous
+
+		mov			%l2, %g6
+
 		ldx			[%sp + PTREGS_OFF + PT_V9_I0], %i0
 		ldx			[%sp + PTREGS_OFF + PT_V9_I1], %i1
 
@@ -252,27 +266,108 @@ rt_continue:	ldx			[%sp + PTREGS_OFF + PT_V9_G1], %g1
 
 		brnz,pn			%l3, kern_rtt
 		 mov			PRIMARY_CONTEXT, %l7
-		ldxa			[%l7 + %l7] ASI_DMMU, %l0
+
+661:		ldxa			[%l7 + %l7] ASI_DMMU, %l0
+		.section		.sun4v_1insn_patch, "ax"
+		.word			661b
+		ldxa			[%l7 + %l7] ASI_MMU, %l0
+		.previous
+
 		sethi			%hi(sparc64_kern_pri_nuc_bits), %l1
 		ldx			[%l1 + %lo(sparc64_kern_pri_nuc_bits)], %l1
 		or			%l0, %l1, %l0
-		stxa			%l0, [%l7] ASI_DMMU
-		flush			%g6
+
+661:		stxa			%l0, [%l7] ASI_DMMU
+		.section		.sun4v_1insn_patch, "ax"
+		.word			661b
+		stxa			%l0, [%l7] ASI_MMU
+		.previous
+
+		sethi			%hi(KERNBASE), %l7
+		flush			%l7
 		rdpr			%wstate, %l1
 		rdpr			%otherwin, %l2
 		srl			%l1, 3, %l1
 
 		wrpr			%l2, %g0, %canrestore
 		wrpr			%l1, %g0, %wstate
-		wrpr			%g0, %g0, %otherwin
+		brnz,pt			%l2, user_rtt_restore
+		 wrpr			%g0, %g0, %otherwin
+
+		ldx			[%g6 + TI_FLAGS], %g3
+		wr			%g0, ASI_AIUP, %asi
+		rdpr			%cwp, %g1
+		andcc			%g3, _TIF_32BIT, %g0
+		sub			%g1, 1, %g1
+		bne,pt			%xcc, user_rtt_fill_32bit
+		 wrpr			%g1, %cwp
+		ba,a,pt			%xcc, user_rtt_fill_64bit
+
+user_rtt_fill_fixup:
+		rdpr	%cwp, %g1
+		add	%g1, 1, %g1
+		wrpr	%g1, 0x0, %cwp
+
+		rdpr	%wstate, %g2
+		sll	%g2, 3, %g2
+		wrpr	%g2, 0x0, %wstate
+
+		/* We know %canrestore and %otherwin are both zero.  */
+
+		sethi	%hi(sparc64_kern_pri_context), %g2
+		ldx	[%g2 + %lo(sparc64_kern_pri_context)], %g2
+		mov	PRIMARY_CONTEXT, %g1
+
+661:		stxa	%g2, [%g1] ASI_DMMU
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		stxa	%g2, [%g1] ASI_MMU
+		.previous
+
+		sethi	%hi(KERNBASE), %g1
+		flush	%g1
+
+		or	%g4, FAULT_CODE_WINFIXUP, %g4
+		stb	%g4, [%g6 + TI_FAULT_CODE]
+		stx	%g5, [%g6 + TI_FAULT_ADDR]
+
+		mov	%g6, %l1
+		wrpr	%g0, 0x0, %tl
+
+661:		nop
+		.section		.sun4v_1insn_patch, "ax"
+		.word			661b
+		SET_GL(0)
+		.previous
+
+		wrpr	%g0, RTRAP_PSTATE, %pstate
+
+		mov	%l1, %g6
+		ldx	[%g6 + TI_TASK], %g4
+		LOAD_PER_CPU_BASE(%g5, %g6, %g1, %g2, %g3)
+		call	do_sparc64_fault
+		 add	%sp, PTREGS_OFF, %o0
+		ba,pt	%xcc, rtrap
+		 nop
+
+user_rtt_pre_restore:
+		add			%g1, 1, %g1
+		wrpr			%g1, 0x0, %cwp
+
+user_rtt_restore:
 		restore
 		rdpr			%canrestore, %g1
 		wrpr			%g1, 0x0, %cleanwin
 		retry
 		nop
 
-kern_rtt:	restore
+kern_rtt:	rdpr			%canrestore, %g1
+		brz,pn			%g1, kern_rtt_fill
+		 nop
+kern_rtt_restore:
+		restore
 		retry
+
 to_kernel:
 #ifdef CONFIG_PREEMPT
 		ldsw			[%g6 + TI_PRE_COUNT], %l5

+ 5 - 5
arch/sparc64/kernel/sbus.c

@@ -693,11 +693,11 @@ void sbus_set_sbus64(struct sbus_dev *sdev, int bursts)
 
 /* SBUS SYSIO INO number to Sparc PIL level. */
 static unsigned char sysio_ino_to_pil[] = {
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 0 */
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 1 */
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 2 */
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 3 */
-	4, /* Onboard SCSI */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 0 */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 1 */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 2 */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 3 */
+	5, /* Onboard SCSI */
 	5, /* Onboard Ethernet */
 /*XXX*/	8, /* Onboard BPP */
 	0, /* Bogon */

+ 124 - 285
arch/sparc64/kernel/setup.c

@@ -64,12 +64,6 @@ struct screen_info screen_info = {
 	16                      /* orig-video-points */
 };
 
-/* Typing sync at the prom prompt calls the function pointed to by
- * the sync callback which I set to the following function.
- * This should sync all filesystems and return, for now it just
- * prints out pretty messages and returns.
- */
-
 void (*prom_palette)(int);
 void (*prom_keyboard)(void);
 
@@ -79,259 +73,6 @@ prom_console_write(struct console *con, const char *s, unsigned n)
 	prom_write(s, n);
 }
 
-static struct console prom_console = {
-	.name =		"prom",
-	.write =	prom_console_write,
-	.flags =	CON_CONSDEV | CON_ENABLED,
-	.index =	-1,
-};
-
-#define PROM_TRUE	-1
-#define PROM_FALSE	0
-
-/* Pretty sick eh? */
-int prom_callback(long *args)
-{
-	struct console *cons, *saved_console = NULL;
-	unsigned long flags;
-	char *cmd;
-	extern spinlock_t prom_entry_lock;
-
-	if (!args)
-		return -1;
-	if (!(cmd = (char *)args[0]))
-		return -1;
-
-	/*
-	 * The callback can be invoked on the cpu that first dropped 
-	 * into prom_cmdline after taking the serial interrupt, or on 
-	 * a slave processor that was smp_captured() if the 
-	 * administrator has done a switch-cpu inside obp. In either 
-	 * case, the cpu is marked as in-interrupt. Drop IRQ locks.
-	 */
-	irq_exit();
-
-	/* XXX Revisit the locking here someday.  This is a debugging
-	 * XXX feature so it isnt all that critical.  -DaveM
-	 */
-	local_irq_save(flags);
-
-	spin_unlock(&prom_entry_lock);
-	cons = console_drivers;
-	while (cons) {
-		unregister_console(cons);
-		cons->flags &= ~(CON_PRINTBUFFER);
-		cons->next = saved_console;
-		saved_console = cons;
-		cons = console_drivers;
-	}
-	register_console(&prom_console);
-	if (!strcmp(cmd, "sync")) {
-		prom_printf("PROM `%s' command...\n", cmd);
-		show_free_areas();
-		if (current->pid != 0) {
-			local_irq_enable();
-			sys_sync();
-			local_irq_disable();
-		}
-		args[2] = 0;
-		args[args[1] + 3] = -1;
-		prom_printf("Returning to PROM\n");
-	} else if (!strcmp(cmd, "va>tte-data")) {
-		unsigned long ctx, va;
-		unsigned long tte = 0;
-		long res = PROM_FALSE;
-
-		ctx = args[3];
-		va = args[4];
-		if (ctx) {
-			/*
-			 * Find process owning ctx, lookup mapping.
-			 */
-			struct task_struct *p;
-			struct mm_struct *mm = NULL;
-			pgd_t *pgdp;
-			pud_t *pudp;
-			pmd_t *pmdp;
-			pte_t *ptep;
-			pte_t pte;
-
-			for_each_process(p) {
-				mm = p->mm;
-				if (CTX_NRBITS(mm->context) == ctx)
-					break;
-			}
-			if (!mm ||
-			    CTX_NRBITS(mm->context) != ctx)
-				goto done;
-
-			pgdp = pgd_offset(mm, va);
-			if (pgd_none(*pgdp))
-				goto done;
-			pudp = pud_offset(pgdp, va);
-			if (pud_none(*pudp))
-				goto done;
-			pmdp = pmd_offset(pudp, va);
-			if (pmd_none(*pmdp))
-				goto done;
-
-			/* Preemption implicitly disabled by virtue of
-			 * being called from inside OBP.
-			 */
-			ptep = pte_offset_map(pmdp, va);
-			pte = *ptep;
-			if (pte_present(pte)) {
-				tte = pte_val(pte);
-				res = PROM_TRUE;
-			}
-			pte_unmap(ptep);
-			goto done;
-		}
-
-		if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) {
-			extern unsigned long sparc64_kern_pri_context;
-
-			/* Spitfire Errata #32 workaround */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (sparc64_kern_pri_context),
-					       "r" (PRIMARY_CONTEXT),
-					       "i" (ASI_DMMU));
-
-			/*
-			 * Locked down tlb entry.
-			 */
-
-			if (tlb_type == spitfire)
-				tte = spitfire_get_dtlb_data(SPITFIRE_HIGHEST_LOCKED_TLBENT);
-			else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-				tte = cheetah_get_ldtlb_data(CHEETAH_HIGHEST_LOCKED_TLBENT);
-
-			res = PROM_TRUE;
-			goto done;
-		}
-
-		if (va < PGDIR_SIZE) {
-			/*
-			 * vmalloc or prom_inherited mapping.
-			 */
-			pgd_t *pgdp;
-			pud_t *pudp;
-			pmd_t *pmdp;
-			pte_t *ptep;
-			pte_t pte;
-			int error;
-
-			if ((va >= LOW_OBP_ADDRESS) && (va < HI_OBP_ADDRESS)) {
-				tte = prom_virt_to_phys(va, &error);
-				if (!error)
-					res = PROM_TRUE;
-				goto done;
-			}
-			pgdp = pgd_offset_k(va);
-			if (pgd_none(*pgdp))
-				goto done;
-			pudp = pud_offset(pgdp, va);
-			if (pud_none(*pudp))
-				goto done;
-			pmdp = pmd_offset(pudp, va);
-			if (pmd_none(*pmdp))
-				goto done;
-
-			/* Preemption implicitly disabled by virtue of
-			 * being called from inside OBP.
-			 */
-			ptep = pte_offset_kernel(pmdp, va);
-			pte = *ptep;
-			if (pte_present(pte)) {
-				tte = pte_val(pte);
-				res = PROM_TRUE;
-			}
-			goto done;
-		}
-
-		if (va < PAGE_OFFSET) {
-			/*
-			 * No mappings here.
-			 */
-			goto done;
-		}
-
-		if (va & (1UL << 40)) {
-			/*
-			 * I/O page.
-			 */
-
-			tte = (__pa(va) & _PAGE_PADDR) |
-			      _PAGE_VALID | _PAGE_SZ4MB |
-			      _PAGE_E | _PAGE_P | _PAGE_W;
-			res = PROM_TRUE;
-			goto done;
-		}
-
-		/*
-		 * Normal page.
-		 */
-		tte = (__pa(va) & _PAGE_PADDR) |
-		      _PAGE_VALID | _PAGE_SZ4MB |
-		      _PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W;
-		res = PROM_TRUE;
-
-	done:
-		if (res == PROM_TRUE) {
-			args[2] = 3;
-			args[args[1] + 3] = 0;
-			args[args[1] + 4] = res;
-			args[args[1] + 5] = tte;
-		} else {
-			args[2] = 2;
-			args[args[1] + 3] = 0;
-			args[args[1] + 4] = res;
-		}
-	} else if (!strcmp(cmd, ".soft1")) {
-		unsigned long tte;
-
-		tte = args[3];
-		prom_printf("%lx:\"%s%s%s%s%s\" ",
-			    (tte & _PAGE_SOFT) >> 7,
-			    tte & _PAGE_MODIFIED ? "M" : "-",
-			    tte & _PAGE_ACCESSED ? "A" : "-",
-			    tte & _PAGE_READ     ? "W" : "-",
-			    tte & _PAGE_WRITE    ? "R" : "-",
-			    tte & _PAGE_PRESENT  ? "P" : "-");
-
-		args[2] = 2;
-		args[args[1] + 3] = 0;
-		args[args[1] + 4] = PROM_TRUE;
-	} else if (!strcmp(cmd, ".soft2")) {
-		unsigned long tte;
-
-		tte = args[3];
-		prom_printf("%lx ", (tte & 0x07FC000000000000UL) >> 50);
-
-		args[2] = 2;
-		args[args[1] + 3] = 0;
-		args[args[1] + 4] = PROM_TRUE;
-	} else {
-		prom_printf("unknown PROM `%s' command...\n", cmd);
-	}
-	unregister_console(&prom_console);
-	while (saved_console) {
-		cons = saved_console;
-		saved_console = cons->next;
-		register_console(cons);
-	}
-	spin_lock(&prom_entry_lock);
-	local_irq_restore(flags);
-
-	/*
-	 * Restore in-interrupt status for a resume from obp.
-	 */
-	irq_enter();
-	return 0;
-}
-
 unsigned int boot_flags = 0;
 #define BOOTME_DEBUG  0x1
 #define BOOTME_SINGLE 0x2
@@ -479,15 +220,99 @@ char reboot_command[COMMAND_LINE_SIZE];
 
 static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 };
 
-void register_prom_callbacks(void)
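+/* Rewrite the get_cpuid instruction sequences recorded in the
+ * cpuid patch table with the variant suited to this cpu type.
+ */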
+static void __init per_cpu_patch(void)
 {
-	prom_setcallback(prom_callback);
-	prom_feval(": linux-va>tte-data 2 \" va>tte-data\" $callback drop ; "
-		   "' linux-va>tte-data to va>tte-data");
-	prom_feval(": linux-.soft1 1 \" .soft1\" $callback 2drop ; "
-		   "' linux-.soft1 to .soft1");
-	prom_feval(": linux-.soft2 1 \" .soft2\" $callback 2drop ; "
-		   "' linux-.soft2 to .soft2");
+	struct cpuid_patch_entry *p;
+	unsigned long ver;
+	int is_jbus;
+
+	if (tlb_type == spitfire && !this_is_starfire)
+		return;
+
+	is_jbus = 0;
+	if (tlb_type != hypervisor) {
+		__asm__ ("rdpr %%ver, %0" : "=r" (ver));
+		is_jbus = ((ver >> 32UL) == __JALAPENO_ID ||
+			   (ver >> 32UL) == __SERRANO_ID);
+	}
+
+	p = &__cpuid_patch;
+	while (p < &__cpuid_patch_end) {
+		unsigned long addr = p->addr;
+		unsigned int *insns;
+
+		switch (tlb_type) {
+		case spitfire:
+			insns = &p->starfire[0];
+			break;
+		case cheetah:
+		case cheetah_plus:
+			if (is_jbus)
+				insns = &p->cheetah_jbus[0];
+			else
+				insns = &p->cheetah_safari[0];
+			break;
+		case hypervisor:
+			insns = &p->sun4v[0];
+			break;
+		default:
+			prom_printf("Unknown cpu type, halting.\n");
+			prom_halt();
+		};
+
+		*(unsigned int *) (addr +  0) = insns[0];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  0));
+
+		*(unsigned int *) (addr +  4) = insns[1];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  4));
+
+		*(unsigned int *) (addr +  8) = insns[2];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  8));
+
+		*(unsigned int *) (addr + 12) = insns[3];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr + 12));
+
+		p++;
+	}
+}
+
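+/* Overwrite the instructions recorded in the sun4v patch tables
+ * with their hypervisor-specific replacements.
+ */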
+static void __init sun4v_patch(void)
+{
+	struct sun4v_1insn_patch_entry *p1;
+	struct sun4v_2insn_patch_entry *p2;
+
+	if (tlb_type != hypervisor)
+		return;
+
+	p1 = &__sun4v_1insn_patch;
+	while (p1 < &__sun4v_1insn_patch_end) {
+		unsigned long addr = p1->addr;
+
+		*(unsigned int *) (addr +  0) = p1->insn;
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  0));
+
+		p1++;
+	}
+
+	p2 = &__sun4v_2insn_patch;
+	while (p2 < &__sun4v_2insn_patch_end) {
+		unsigned long addr = p2->addr;
+
+		*(unsigned int *) (addr +  0) = p2->insns[0];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  0));
+
+		*(unsigned int *) (addr +  4) = p2->insns[1];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  4));
+
+		p2++;
+	}
 }
 
 void __init setup_arch(char **cmdline_p)
@@ -496,7 +321,10 @@ void __init setup_arch(char **cmdline_p)
 	*cmdline_p = prom_getbootargs();
 	strcpy(saved_command_line, *cmdline_p);
 
-	printk("ARCH: SUN4U\n");
+	if (tlb_type == hypervisor)
+		printk("ARCH: SUN4V\n");
+	else
+		printk("ARCH: SUN4U\n");
 
 #ifdef CONFIG_DUMMY_CONSOLE
 	conswitchp = &dummy_con;
@@ -507,6 +335,13 @@ void __init setup_arch(char **cmdline_p)
 	/* Work out if we are starfire early on */
 	check_if_starfire();
 
+	/* Now we know enough to patch the get_cpuid sequences
+	 * used by trap code.
+	 */
+	per_cpu_patch();
+
+	sun4v_patch();
+
 	boot_flags_init(*cmdline_p);
 
 	idprom_init();
@@ -514,7 +349,7 @@ void __init setup_arch(char **cmdline_p)
 	if (!root_flags)
 		root_mountflags &= ~MS_RDONLY;
 	ROOT_DEV = old_decode_dev(root_dev);
-#ifdef CONFIG_BLK_DEV_INITRD
+#ifdef CONFIG_BLK_DEV_RAM
 	rd_image_start = ram_flags & RAMDISK_IMAGE_START_MASK;
 	rd_prompt = ((ram_flags & RAMDISK_PROMPT_FLAG) != 0);
 	rd_doload = ((ram_flags & RAMDISK_LOAD_FLAG) != 0);	
@@ -544,6 +379,9 @@ void __init setup_arch(char **cmdline_p)
 
 	smp_setup_cpu_possible_map();
 
+	/* Get boot processor trap_block[] setup.  */
+	init_cur_cpu_trap(current_thread_info());
+
 	paging_init();
 }
 
@@ -565,6 +403,12 @@ static int __init set_preferred_console(void)
 		serial_console = 2;
 	} else if (idev == PROMDEV_IRSC && odev == PROMDEV_ORSC) {
 		serial_console = 3;
+	} else if (idev == PROMDEV_IVCONS && odev == PROMDEV_OVCONS) {
+		/* sunhv_console_init() doesn't check the serial_console
+		 * value anyways...
+		 */
+		serial_console = 4;
+		return add_preferred_console("ttyHV", 0, NULL);
 	} else {
 		prom_printf("Inconsistent console: "
 			    "input %d, output %d\n",
@@ -598,9 +442,8 @@ static int show_cpuinfo(struct seq_file *m, void *__unused)
 	seq_printf(m, 
 		   "cpu\t\t: %s\n"
 		   "fpu\t\t: %s\n"
-		   "promlib\t\t: Version 3 Revision %d\n"
-		   "prom\t\t: %d.%d.%d\n"
-		   "type\t\t: sun4u\n"
+		   "prom\t\t: %s\n"
+		   "type\t\t: %s\n"
 		   "ncpus probed\t: %d\n"
 		   "ncpus active\t: %d\n"
 		   "D$ parity tl1\t: %u\n"
@@ -612,10 +455,10 @@ static int show_cpuinfo(struct seq_file *m, void *__unused)
 		   ,
 		   sparc_cpu_type,
 		   sparc_fpu_type,
-		   prom_rev,
-		   prom_prev >> 16,
-		   (prom_prev >> 8) & 0xff,
-		   prom_prev & 0xff,
+		   prom_version,
+		   ((tlb_type == hypervisor) ?
+		    "sun4v" :
+		    "sun4u"),
 		   ncpus_probed,
 		   num_online_cpus(),
 		   dcache_parity_tl1_occurred,
@@ -692,15 +535,11 @@ static int __init topology_init(void)
 	while (!cpu_find_by_instance(ncpus_probed, NULL, NULL))
 		ncpus_probed++;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			struct cpu *p = kmalloc(sizeof(*p), GFP_KERNEL);
-
-			if (p) {
-				memset(p, 0, sizeof(*p));
-				register_cpu(p, i, NULL);
-				err = 0;
-			}
+	for_each_cpu(i) {
+		struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
+		if (p) {
+			register_cpu(p, i, NULL);
+			err = 0;
 		}
 	}
 

+ 314 - 104
arch/sparc64/kernel/smp.c

@@ -38,6 +38,7 @@
 #include <asm/timer.h>
 #include <asm/starfire.h>
 #include <asm/tlb.h>
+#include <asm/sections.h>
 
 extern void calibrate_delay(void);
 
@@ -46,6 +47,8 @@ static unsigned char boot_cpu_id;
 
 cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
 cpumask_t phys_cpu_present_map __read_mostly = CPU_MASK_NONE;
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly =
+	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 static cpumask_t smp_commenced_mask;
 static cpumask_t cpu_callout_map;
 
@@ -77,7 +80,7 @@ void smp_bogo(struct seq_file *m)
 
 void __init smp_store_cpu_info(int id)
 {
-	int cpu_node;
+	int cpu_node, def;
 
 	/* multiplier and counter set by
 	   smp_setup_percpu_timer()  */
@@ -87,24 +90,32 @@ void __init smp_store_cpu_info(int id)
 	cpu_data(id).clock_tick = prom_getintdefault(cpu_node,
 						     "clock-frequency", 0);
 
-	cpu_data(id).pgcache_size		= 0;
-	cpu_data(id).pte_cache[0]		= NULL;
-	cpu_data(id).pte_cache[1]		= NULL;
-	cpu_data(id).pgd_cache			= NULL;
-	cpu_data(id).idle_volume		= 1;
-
+	def = ((tlb_type == hypervisor) ? (8 * 1024) : (16 * 1024));
 	cpu_data(id).dcache_size = prom_getintdefault(cpu_node, "dcache-size",
-						      16 * 1024);
+						      def);
+
+	def = 32;
 	cpu_data(id).dcache_line_size =
-		prom_getintdefault(cpu_node, "dcache-line-size", 32);
+		prom_getintdefault(cpu_node, "dcache-line-size", def);
+
+	def = 16 * 1024;
 	cpu_data(id).icache_size = prom_getintdefault(cpu_node, "icache-size",
-						      16 * 1024);
+						      def);
+
+	def = 32;
 	cpu_data(id).icache_line_size =
-		prom_getintdefault(cpu_node, "icache-line-size", 32);
+		prom_getintdefault(cpu_node, "icache-line-size", def);
+
+	def = ((tlb_type == hypervisor) ?
+	       (3 * 1024 * 1024) :
+	       (4 * 1024 * 1024));
 	cpu_data(id).ecache_size = prom_getintdefault(cpu_node, "ecache-size",
 	cpu_data(id).ecache_size = prom_getintdefault(cpu_node, "ecache-size",
-						      4 * 1024 * 1024);
+						      def);
+
+	def = 64;
 	cpu_data(id).ecache_line_size =
 	cpu_data(id).ecache_line_size =
-		prom_getintdefault(cpu_node, "ecache-line-size", 64);
+		prom_getintdefault(cpu_node, "ecache-line-size", def);
+
 	printk("CPU[%d]: Caches "
 	printk("CPU[%d]: Caches "
 	       "D[sz(%d):line_sz(%d)] "
 	       "D[sz(%d):line_sz(%d)] "
 	       "I[sz(%d):line_sz(%d)] "
 	       "I[sz(%d):line_sz(%d)] "
@@ -119,27 +130,16 @@ static void smp_setup_percpu_timer(void);
 
 static volatile unsigned long callin_flag = 0;
 
-extern void inherit_locked_prom_mappings(int save_p);
-
-static inline void cpu_setup_percpu_base(unsigned long cpu_id)
-{
-	__asm__ __volatile__("mov	%0, %%g5\n\t"
-			     "stxa	%0, [%1] %2\n\t"
-			     "membar	#Sync"
-			     : /* no outputs */
-			     : "r" (__per_cpu_offset(cpu_id)),
-			       "r" (TSB_REG), "i" (ASI_IMMU));
-}
-
 void __init smp_callin(void)
 {
 	int cpuid = hard_smp_processor_id();
 
-	inherit_locked_prom_mappings(0);
+	__local_per_cpu_offset = __per_cpu_offset(cpuid);
 
-	__flush_tlb_all();
+	if (tlb_type == hypervisor)
+		sun4v_ktsb_register();
 
-	cpu_setup_percpu_base(cpuid);
+	__flush_tlb_all();
 
 	smp_setup_percpu_timer();
 
@@ -316,6 +316,8 @@ static void smp_synchronize_one_tick(int cpu)
 	spin_unlock_irqrestore(&itc_sync_lock, flags);
 }
 
+extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
+
 extern unsigned long sparc64_cpu_startup;
 
 /* The OBP cpu startup callback truncates the 3rd arg cookie to
@@ -331,21 +333,31 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu)
 	unsigned long cookie =
 		(unsigned long)(&cpu_new_thread);
 	struct task_struct *p;
-	int timeout, ret, cpu_node;
+	int timeout, ret;
 
 	p = fork_idle(cpu);
 	callin_flag = 0;
 	cpu_new_thread = task_thread_info(p);
 	cpu_set(cpu, cpu_callout_map);
 
-	cpu_find_by_mid(cpu, &cpu_node);
-	prom_startcpu(cpu_node, entry, cookie);
+	if (tlb_type == hypervisor) {
+		/* Alloc the mondo queues, cpu will load them.  */
+		sun4v_init_mondo_queues(0, cpu, 1, 0);
+
+		prom_startcpu_cpuid(cpu, entry, cookie);
+	} else {
+		int cpu_node;
+
+		cpu_find_by_mid(cpu, &cpu_node);
+		prom_startcpu(cpu_node, entry, cookie);
+	}
 
 	for (timeout = 0; timeout < 5000000; timeout++) {
 		if (callin_flag)
 			break;
 		udelay(100);
 	}
+
 	if (callin_flag) {
 		ret = 0;
 	} else {
@@ -441,7 +453,7 @@ static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, c
 static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
 {
 	u64 pstate, ver;
-	int nack_busy_id, is_jalapeno;
+	int nack_busy_id, is_jbus;
 
 	if (cpus_empty(mask))
 		return;
@@ -451,7 +463,8 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mas
 	 * derivative processor.
 	 */
 	__asm__ ("rdpr %%ver, %0" : "=r" (ver));
-	is_jalapeno = ((ver >> 32) == 0x003e0016);
+	is_jbus = ((ver >> 32) == __JALAPENO_ID ||
+		   (ver >> 32) == __SERRANO_ID);
 
 	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
 
@@ -476,7 +489,7 @@ retry:
 		for_each_cpu_mask(i, mask) {
 			u64 target = (i << 14) | 0x70;
 
-			if (!is_jalapeno)
+			if (!is_jbus)
 				target |= (nack_busy_id << 24);
 			__asm__ __volatile__(
 				"stxa	%%g0, [%0] %1\n\t"
@@ -529,7 +542,7 @@ retry:
 			for_each_cpu_mask(i, mask) {
 				u64 check_mask;
 
-				if (is_jalapeno)
+				if (is_jbus)
 					check_mask = (0x2UL << (2*i));
 				else
 					check_mask = (0x2UL <<
@@ -544,6 +557,155 @@ retry:
 	}
 }
 
+/* Multi-cpu list version.  */
+static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+{
+	struct trap_per_cpu *tb;
+	u16 *cpu_list;
+	u64 *mondo;
+	cpumask_t error_mask;
+	unsigned long flags, status;
+	int cnt, retries, this_cpu, prev_sent, i;
+
+	/* We have to do this whole thing with interrupts fully disabled.
+	 * Otherwise if we send an xcall from interrupt context it will
+	 * corrupt both our mondo block and cpu list state.
+	 *
+	 * One consequence of this is that we cannot use timeout mechanisms
+	 * that depend upon interrupts being delivered locally.  So, for
+	 * example, we cannot sample jiffies and expect it to advance.
+	 *
+	 * Fortunately, udelay() uses %stick/%tick so we can use that.
+	 */
+	local_irq_save(flags);
+
+	this_cpu = smp_processor_id();
+	tb = &trap_block[this_cpu];
+
+	mondo = __va(tb->cpu_mondo_block_pa);
+	mondo[0] = data0;
+	mondo[1] = data1;
+	mondo[2] = data2;
+	wmb();
+
+	cpu_list = __va(tb->cpu_list_pa);
+
+	/* Setup the initial cpu list.  */
+	cnt = 0;
+	for_each_cpu_mask(i, mask)
+		cpu_list[cnt++] = i;
+
+	cpus_clear(error_mask);
+	retries = 0;
+	prev_sent = 0;
+	do {
+		int forward_progress, n_sent;
+
+		status = sun4v_cpu_mondo_send(cnt,
+					      tb->cpu_list_pa,
+					      tb->cpu_mondo_block_pa);
+
+		/* HV_EOK means all cpus received the xcall, we're done.  */
+		if (likely(status == HV_EOK))
+			break;
+
+		/* First, see if we made any forward progress.
+		 *
+		 * The hypervisor indicates successful sends by setting
+		 * cpu list entries to the value 0xffff.
+		 */
+		n_sent = 0;
+		for (i = 0; i < cnt; i++) {
+			if (likely(cpu_list[i] == 0xffff))
+				n_sent++;
+		}
+
+		forward_progress = 0;
+		if (n_sent > prev_sent)
+			forward_progress = 1;
+
+		prev_sent = n_sent;
+
+		/* If we get a HV_ECPUERROR, then one or more of the cpus
+		 * in the list are in error state.  Use the cpu_state()
+		 * hypervisor call to find out which cpus are in error state.
+		 */
+		if (unlikely(status == HV_ECPUERROR)) {
+			for (i = 0; i < cnt; i++) {
+				long err;
+				u16 cpu;
+
+				cpu = cpu_list[i];
+				if (cpu == 0xffff)
+					continue;
+
+				err = sun4v_cpu_state(cpu);
+				if (err >= 0 &&
+				    err == HV_CPU_STATE_ERROR) {
+					cpu_list[i] = 0xffff;
+					cpu_set(cpu, error_mask);
+				}
+			}
+		} else if (unlikely(status != HV_EWOULDBLOCK))
+			goto fatal_mondo_error;
+
+		/* Don't bother rewriting the CPU list, just leave the
+		 * 0xffff and non-0xffff entries in there and the
+		 * hypervisor will do the right thing.
+		 *
+		 * Only advance timeout state if we didn't make any
+		 * forward progress.
+		 */
+		if (unlikely(!forward_progress)) {
+			if (unlikely(++retries > 10000))
+				goto fatal_mondo_timeout;
+
+			/* Delay a little bit to let other cpus catch up
+			 * on their cpu mondo queue work.
+			 */
+			udelay(2 * cnt);
+		}
+	} while (1);
+
+	local_irq_restore(flags);
+
+	if (unlikely(!cpus_empty(error_mask)))
+		goto fatal_mondo_cpu_error;
+
+	return;
+
+fatal_mondo_cpu_error:
+	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
+	       "were in error state\n",
+	       this_cpu);
+	printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
+	for_each_cpu_mask(i, error_mask)
+		printk("%d ", i);
+	printk("]\n");
+	return;
+
+fatal_mondo_timeout:
+	local_irq_restore(flags);
+	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
+	       "progress after %d retries.\n",
+	       this_cpu, retries);
+	goto dump_cpu_list_and_out;
+
+fatal_mondo_error:
+	local_irq_restore(flags);
+	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
+	       this_cpu, status);
+	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
+	       "mondo_block_pa(%lx)\n",
+	       this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
+
+dump_cpu_list_and_out:
+	printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
+	for (i = 0; i < cnt; i++)
+		printk("%u ", cpu_list[i]);
+	printk("]\n");
+}
+
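The control flow in hypervisor_xcall_deliver() is easy to lose in the error handling, so here is the bare retry discipline in isolation: resend the whole list each time, count how many slots the hypervisor marked delivered (0xffff), and only consume one of the 10000 bounded retries when that count stops growing. A minimal user-space model assuming just the 0xffff slot convention described in the comments; send_list is a caller-supplied stand-in for sun4v_cpu_mondo_send():

    #include <stdint.h>

    #define SENT 0xffffu	/* slot value written on successful delivery */

    static int deliver_all(uint16_t *list, int cnt,
    		       int (*send_list)(uint16_t *, int))
    {
    	int retries = 0, prev_sent = 0;

    	while (send_list(list, cnt) != 0) {	/* nonzero: partial send */
    		int i, n_sent = 0;

    		for (i = 0; i < cnt; i++)	/* measure forward progress */
    			if (list[i] == SENT)
    				n_sent++;

    		if (n_sent <= prev_sent && ++retries > 10000)
    			return -1;		/* timeout: no progress */

    		prev_sent = n_sent;
    	}
    	return 0;				/* everything delivered */
    }

The real function additionally backs off with udelay(2 * cnt), since jiffies cannot advance with local interrupts disabled.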
 /* Send cross call to all processors mentioned in MASK
  * except self.
  */
@@ -557,8 +719,10 @@ static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 d
 
 	if (tlb_type == spitfire)
 		spitfire_xcall_deliver(data0, data1, data2, mask);
-	else
+	else if (tlb_type == cheetah || tlb_type == cheetah_plus)
 		cheetah_xcall_deliver(data0, data1, data2, mask);
+	else
+		hypervisor_xcall_deliver(data0, data1, data2, mask);
 	/* NOTE: Caller runs local copy on master. */
 
 	put_cpu();
@@ -594,16 +758,13 @@ extern unsigned long xcall_call_function;
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int smp_call_function(void (*func)(void *info), void *info,
-		      int nonatomic, int wait)
+static int smp_call_function_mask(void (*func)(void *info), void *info,
+				  int nonatomic, int wait, cpumask_t mask)
 {
 	struct call_data_struct data;
-	int cpus = num_online_cpus() - 1;
+	int cpus;
 	long timeout;
 
-	if (!cpus)
-		return 0;
-
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
@@ -614,9 +775,14 @@ int smp_call_function(void (*func)(void *info), void *info,
 
 	spin_lock(&call_lock);
 
+	cpu_clear(smp_processor_id(), mask);
+	cpus = cpus_weight(mask);
+	if (!cpus)
+		goto out_unlock;
+
 	call_data = &data;
 
-	smp_cross_call(&xcall_call_function, 0, 0, 0);
+	smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask);
 
 	/* 
 	 * Wait for other cpus to complete function or at
@@ -630,18 +796,25 @@ int smp_call_function(void (*func)(void *info), void *info,
 		udelay(1);
 	}
 
+out_unlock:
 	spin_unlock(&call_lock);
 
 	return 0;
 
 out_timeout:
 	spin_unlock(&call_lock);
-	printk("XCALL: Remote cpus not responding, ncpus=%ld finished=%ld\n",
-	       (long) num_online_cpus() - 1L,
-	       (long) atomic_read(&data.finished));
+	printk("XCALL: Remote cpus not responding, ncpus=%d finished=%d\n",
+	       cpus, atomic_read(&data.finished));
 	return 0;
 }
 
+int smp_call_function(void (*func)(void *info), void *info,
+		      int nonatomic, int wait)
+{
+	return smp_call_function_mask(func, info, nonatomic, wait,
+				      cpu_online_map);
+}
+
 void smp_call_function_client(int irq, struct pt_regs *regs)
 {
 	void (*func) (void *info) = call_data->func;
@@ -659,13 +832,25 @@ void smp_call_function_client(int irq, struct pt_regs *regs)
 	}
 }
 
+static void tsb_sync(void *info)
+{
+	struct mm_struct *mm = info;
+
+	if (current->active_mm == mm)
+		tsb_context_switch(mm);
+}
+
+void smp_tsb_sync(struct mm_struct *mm)
+{
+	smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask);
+}
+
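Design note: with the mask-taking variant in place, smp_call_function() becomes the special case mask == cpu_online_map, and callers like smp_tsb_sync() interrupt only the CPUs that have actually run the address space. A hedged usage sketch in the same kernel context (sync_one() and the caller are hypothetical; the call convention is the patch's):

    /* Hypothetical in-file caller: run sync_one() on every cpu in this
     * mm's cpu_vm_mask and wait for completion (wait = 1).  The initiator
     * still runs the local copy itself, per the smp_cross_call_masked()
     * note above. */
    static void sync_mm_everywhere(struct mm_struct *mm)
    {
    	smp_call_function_mask(sync_one, mm, 0, 1, mm->cpu_vm_mask);
    	sync_one(mm);
    }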
 extern unsigned long xcall_flush_tlb_mm;
 extern unsigned long xcall_flush_tlb_pending;
 extern unsigned long xcall_flush_tlb_kernel_range;
-extern unsigned long xcall_flush_tlb_all_spitfire;
-extern unsigned long xcall_flush_tlb_all_cheetah;
 extern unsigned long xcall_report_regs;
 extern unsigned long xcall_receive_signal;
+extern unsigned long xcall_new_mmu_context_version;
 
 #ifdef DCACHE_ALIASING_POSSIBLE
 extern unsigned long xcall_flush_dcache_page_cheetah;
@@ -693,11 +878,17 @@ static __inline__ void __local_flush_dcache_page(struct page *page)
 void smp_flush_dcache_page_impl(struct page *page, int cpu)
 {
 	cpumask_t mask = cpumask_of_cpu(cpu);
-	int this_cpu = get_cpu();
+	int this_cpu;
+
+	if (tlb_type == hypervisor)
+		return;
 
 #ifdef CONFIG_DEBUG_DCFLUSH
 	atomic_inc(&dcpage_flushes);
 #endif
+
+	this_cpu = get_cpu();
+
 	if (cpu == this_cpu) {
 		__local_flush_dcache_page(page);
 	} else if (cpu_online(cpu)) {
@@ -713,7 +904,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
 					       __pa(pg_addr),
 					       (u64) pg_addr,
 					       mask);
-		} else {
+		} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 #ifdef DCACHE_ALIASING_POSSIBLE
 			data0 =
 				((u64)&xcall_flush_dcache_page_cheetah);
@@ -735,7 +926,12 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 	void *pg_addr = page_address(page);
 	cpumask_t mask = cpu_online_map;
 	u64 data0;
-	int this_cpu = get_cpu();
+	int this_cpu;
+
+	if (tlb_type == hypervisor)
+		return;
+
+	this_cpu = get_cpu();
 
 	cpu_clear(this_cpu, mask);
 
@@ -752,7 +948,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 				       __pa(pg_addr),
 				       (u64) pg_addr,
 				       mask);
-	} else {
+	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 #ifdef DCACHE_ALIASING_POSSIBLE
 		data0 = ((u64)&xcall_flush_dcache_page_cheetah);
 		cheetah_xcall_deliver(data0,
@@ -769,38 +965,58 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 	put_cpu();
 }
 
+static void __smp_receive_signal_mask(cpumask_t mask)
+{
+	smp_cross_call_masked(&xcall_receive_signal, 0, 0, 0, mask);
+}
+
 void smp_receive_signal(int cpu)
 {
 	cpumask_t mask = cpumask_of_cpu(cpu);
 
-	if (cpu_online(cpu)) {
-		u64 data0 = (((u64)&xcall_receive_signal) & 0xffffffff);
-
-		if (tlb_type == spitfire)
-			spitfire_xcall_deliver(data0, 0, 0, mask);
-		else
-			cheetah_xcall_deliver(data0, 0, 0, mask);
-	}
+	if (cpu_online(cpu))
+		__smp_receive_signal_mask(mask);
 }
 
 void smp_receive_signal_client(int irq, struct pt_regs *regs)
 {
-	/* Just return, rtrap takes care of the rest. */
 	clear_softint(1 << irq);
 }
 
-void smp_report_regs(void)
+void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
 {
-	smp_cross_call(&xcall_report_regs, 0, 0, 0);
+	struct mm_struct *mm;
+	unsigned long flags;
+
+	clear_softint(1 << irq);
+
+	/* See if we need to allocate a new TLB context because
+	 * the version of the one we are using is now out of date.
+	 */
+	mm = current->active_mm;
+	if (unlikely(!mm || (mm == &init_mm)))
+		return;
+
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	if (unlikely(!CTX_VALID(mm->context)))
+		get_new_mmu_context(mm);
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
+
+	load_secondary_context(mm);
+	__flush_tlb_mm(CTX_HWBITS(mm->context),
+		       SECONDARY_CONTEXT);
 }
 
-void smp_flush_tlb_all(void)
+void smp_new_mmu_context_version(void)
 {
-	if (tlb_type == spitfire)
-		smp_cross_call(&xcall_flush_tlb_all_spitfire, 0, 0, 0);
-	else
-		smp_cross_call(&xcall_flush_tlb_all_cheetah, 0, 0, 0);
-	__flush_tlb_all();
+	smp_cross_call(&xcall_new_mmu_context_version, 0, 0, 0);
+}
+
+void smp_report_regs(void)
+{
+	smp_cross_call(&xcall_report_regs, 0, 0, 0);
 }
 
 /* We know that the window frames of the user have been flushed
@@ -944,24 +1160,19 @@ void smp_release(void)
  * can service tlb flush xcalls...
  */
 extern void prom_world(int);
-extern void save_alternate_globals(unsigned long *);
-extern void restore_alternate_globals(unsigned long *);
+
 void smp_penguin_jailcell(int irq, struct pt_regs *regs)
 {
-	unsigned long global_save[24];
-
 	clear_softint(1 << irq);
 
 	preempt_disable();
 
 	__asm__ __volatile__("flushw");
-	save_alternate_globals(global_save);
 	prom_world(1);
 	atomic_inc(&smp_capture_registry);
 	membar_storeload_storestore();
 	while (penguins_are_doing_time)
 		rmb();
-	restore_alternate_globals(global_save);
 	atomic_dec(&smp_capture_registry);
 	prom_world(0);
 
@@ -1082,6 +1293,8 @@ int setup_profiling_timer(unsigned int multiplier)
 /* Constrain the number of cpus to max_cpus.  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	int i;
+
 	if (num_possible_cpus() > max_cpus) {
 		int instance, mid;
 
@@ -1096,6 +1309,20 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		}
 	}
 
+	for_each_cpu(i) {
+		if (tlb_type == hypervisor) {
+			int j;
+
+			/* XXX get this mapping from machine description */
+			for_each_cpu(j) {
+				if ((j >> 2) == (i >> 2))
+					cpu_set(j, cpu_sibling_map[i]);
+			}
+		} else {
+			cpu_set(i, cpu_sibling_map[i]);
+		}
+	}
+
 	smp_store_cpu_info(boot_cpu_id);
 }
 
@@ -1117,12 +1344,15 @@ void __init smp_setup_cpu_possible_map(void)
 
 void __devinit smp_prepare_boot_cpu(void)
 {
-	if (hard_smp_processor_id() >= NR_CPUS) {
+	int cpu = hard_smp_processor_id();
+
+	if (cpu >= NR_CPUS) {
 		prom_printf("Serious problem, boot cpu id >= NR_CPUS\n");
 		prom_halt();
 	}
 
-	current_thread_info()->cpu = hard_smp_processor_id();
+	current_thread_info()->cpu = cpu;
+	__local_per_cpu_offset = __per_cpu_offset(cpu);
 
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), phys_cpu_present_map);
@@ -1139,7 +1369,11 @@ int __devinit __cpu_up(unsigned int cpu)
 		if (!cpu_isset(cpu, cpu_online_map)) {
 			ret = -ENODEV;
 		} else {
-			smp_synchronize_one_tick(cpu);
+			/* On SUN4V, writes to %tick and %stick are
+			 * not allowed.
+			 */
+			if (tlb_type != hypervisor)
+				smp_synchronize_one_tick(cpu);
 		}
 	}
 	return ret;
@@ -1183,12 +1417,9 @@ void __init setup_per_cpu_areas(void)
 {
 	unsigned long goal, size, i;
 	char *ptr;
-	/* Created by linker magic */
-	extern char __per_cpu_start[], __per_cpu_end[];
 
 	/* Copy section for each CPU (we discard the original) */
-	goal = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
-
+	goal = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
 #ifdef CONFIG_MODULES
 	if (goal < PERCPU_ENOUGH_ROOM)
 		goal = PERCPU_ENOUGH_ROOM;
@@ -1197,31 +1428,10 @@ void __init setup_per_cpu_areas(void)
 	for (size = 1UL; size < goal; size <<= 1UL)
 		__per_cpu_shift++;
 
-	/* Make sure the resulting __per_cpu_base value
-	 * will fit in the 43-bit sign extended IMMU
-	 * TSB register.
-	 */
-	ptr = __alloc_bootmem(size * NR_CPUS, PAGE_SIZE,
-			      (unsigned long) __per_cpu_start);
+	ptr = alloc_bootmem(size * NR_CPUS);
 
 	__per_cpu_base = ptr - __per_cpu_start;
 
-	if ((__per_cpu_shift < PAGE_SHIFT) ||
-	    (__per_cpu_base & ~PAGE_MASK) ||
-	    (__per_cpu_base != (((long) __per_cpu_base << 20) >> 20))) {
-		prom_printf("PER_CPU: Invalid layout, "
-			    "ptr[%p] shift[%lx] base[%lx]\n",
-			    ptr, __per_cpu_shift, __per_cpu_base);
-		prom_halt();
-	}
-
 	for (i = 0; i < NR_CPUS; i++, ptr += size)
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
-	/* Finally, load in the boot cpu's base value.
-	 * We abuse the IMMU TSB register for trap handler
-	 * entry and exit loading of %g5.  That is why it
-	 * has to be page aligned.
-	 */
-	cpu_setup_percpu_base(hard_smp_processor_id());
 }
+ 14 - 12
arch/sparc64/kernel/sparc64_ksyms.c

@@ -95,9 +95,6 @@ extern int __ashrdi3(int, int);
 
 extern int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs);
 
-extern unsigned long phys_base;
-extern unsigned long pfn_base;
-
 extern unsigned int sys_call_table[];
 
 extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
@@ -108,6 +105,14 @@ extern void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
 extern void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
 		      unsigned long *, unsigned long *, unsigned long *);
 
+extern void xor_niagara_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_niagara_3(unsigned long, unsigned long *, unsigned long *,
+			  unsigned long *);
+extern void xor_niagara_4(unsigned long, unsigned long *, unsigned long *,
+			  unsigned long *, unsigned long *);
+extern void xor_niagara_5(unsigned long, unsigned long *, unsigned long *,
+			  unsigned long *, unsigned long *, unsigned long *);
+
 /* Per-CPU information table */
 EXPORT_PER_CPU_SYMBOL(__cpu_data);
 
@@ -241,10 +246,6 @@ EXPORT_SYMBOL(verify_compat_iovec);
 #endif
 
 EXPORT_SYMBOL(dump_fpu);
-EXPORT_SYMBOL(pte_alloc_one_kernel);
-#ifndef CONFIG_SMP
-EXPORT_SYMBOL(pgt_quicklists);
-#endif
 EXPORT_SYMBOL(put_fs_struct);
 
 /* math-emu wants this */
@@ -339,14 +340,10 @@ EXPORT_SYMBOL(copy_to_user_fixup);
 EXPORT_SYMBOL(copy_from_user_fixup);
 EXPORT_SYMBOL(copy_in_user_fixup);
 EXPORT_SYMBOL(__strncpy_from_user);
-EXPORT_SYMBOL(__bzero_noasi);
+EXPORT_SYMBOL(__clear_user);
 
 /* Various address conversion macros use this. */
-EXPORT_SYMBOL(phys_base);
-EXPORT_SYMBOL(pfn_base);
 EXPORT_SYMBOL(sparc64_valid_addr_bitmap);
-EXPORT_SYMBOL(page_to_pfn);
-EXPORT_SYMBOL(pfn_to_page);
 
 /* No version information on this, heavily used in inline asm,
  * and will always be 'void __ret_efault(void)'.
@@ -392,4 +389,9 @@ EXPORT_SYMBOL(xor_vis_3);
 EXPORT_SYMBOL(xor_vis_4);
 EXPORT_SYMBOL(xor_vis_5);
 
+EXPORT_SYMBOL(xor_niagara_2);
+EXPORT_SYMBOL(xor_niagara_3);
+EXPORT_SYMBOL(xor_niagara_4);
+EXPORT_SYMBOL(xor_niagara_5);
+
 EXPORT_SYMBOL(prom_palette);

+ 334 - 0
arch/sparc64/kernel/sun4v_ivec.S

@@ -0,0 +1,334 @@
+/* sun4v_ivec.S: Sun4v interrupt vector handling.
+ *
+ * Copyright (C) 2006 <davem@davemloft.net>
+ */
+
+#include <asm/cpudata.h>
+#include <asm/intr_queue.h>
+
+	.text
+	.align	32
+
+sun4v_cpu_mondo:
+	/* Head offset in %g2, tail offset in %g4.
+	 * If they are the same, no work.
+	 */
+	mov	INTRQ_CPU_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_CPU_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_cpu_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get CPU mondo queue base phys address into %g7.  */
+	ldx	[%g3 + TRAP_PER_CPU_CPU_MONDO_PA], %g7
+
+	/* Now get the cross-call arguments and handler PC, same
+	 * layout as sun4u:
+	 *
+	 * 1st 64-bit word: low half is 32-bit PC, put into %g3 and jmpl to it
+	 *                  high half is context arg to MMU flushes, into %g5
+	 * 2nd 64-bit word: 64-bit arg, load into %g1
+	 * 3rd 64-bit word: 64-bit arg, load into %g7
+	 */
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g3
+	add	%g2, 0x8, %g2
+	srlx	%g3, 32, %g5
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g1
+	add	%g2, 0x8, %g2
+	srl	%g3, 0, %g3
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g7
+	add	%g2, 0x40 - 0x8 - 0x8, %g2
+
+	/* Update queue head pointer.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_CPU_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	jmpl	%g3, %g0
+	 nop
+
+sun4v_cpu_mondo_queue_empty:
+	retry
+
+sun4v_dev_mondo:
+	/* Head offset in %g2, tail offset in %g4.  */
+	mov	INTRQ_DEVICE_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_DEVICE_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_dev_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get DEV mondo queue base phys address into %g5.  */
+	ldx	[%g3 + TRAP_PER_CPU_DEV_MONDO_PA], %g5
+
+	/* Load IVEC into %g3.  */
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	add	%g2, 0x40, %g2
+
+	/* XXX There can be a full 64-byte block of data here.
+	 * XXX This is how we can get at MSI vector data.
+ * XXX Currently we do not capture this, but when we do we'll
+	 * XXX need to add a 64-byte storage area in the struct ino_bucket
+	 * XXX or the struct irq_desc.
+	 */
+
+	/* Update queue head pointer, this frees up some registers.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_DEVICE_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	/* Get &__irq_work[smp_processor_id()] into %g1.  */
+	TRAP_LOAD_IRQ_WORK(%g1, %g4)
+
+	/* Get &ivector_table[IVEC] into %g4.  */
+	sethi	%hi(ivector_table), %g4
+	sllx	%g3, 5, %g3
+	or	%g4, %lo(ivector_table), %g4
+	add	%g4, %g3, %g4
+
+	/* Load IRQ %pil into %g5.  */
+	ldub	[%g4 + 0x04], %g5
+
+	/* Insert ivector_table[] entry into __irq_work[] queue.  */
+	sllx	%g5, 2, %g3
+	lduw	[%g1 + %g3], %g2	/* g2 = irq_work(cpu, pil) */
+	stw	%g2, [%g4 + 0x00]	/* bucket->irq_chain = g2 */
+	stw	%g4, [%g1 + %g3]	/* irq_work(cpu, pil) = bucket */
+
+	/* Signal the interrupt by setting (1 << pil) in %softint.  */
+	mov	1, %g2
+	sllx	%g2, %g5, %g2
+	wr	%g2, 0x0, %set_softint
+
+sun4v_dev_mondo_queue_empty:
+	retry
+
+sun4v_res_mondo:
+	/* Head offset in %g2, tail offset in %g4.  */
+	mov	INTRQ_RESUM_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_RESUM_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_res_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get RES mondo queue base phys address into %g5.  */
+	ldx	[%g3 + TRAP_PER_CPU_RESUM_MONDO_PA], %g5
+
+	/* Get RES kernel buffer base phys address into %g7.  */
+	ldx	[%g3 + TRAP_PER_CPU_RESUM_KBUF_PA], %g7
+
+	/* If the first word is non-zero, queue is full.  */
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g1
+	brnz,pn	%g1, sun4v_res_mondo_queue_full
+	 nop
+
+	/* Remember this entry's offset in %g1.  */
+	mov	%g2, %g1
+
+	/* Copy 64-byte queue entry into kernel buffer.  */
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+
+	/* Update queue head pointer.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_RESUM_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	/* Disable interrupts and save register state so we can call
+	 * C code.  The etrap handling will leave %g4 in %l4 for us
+	 * when it's done.
+	 */
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	mov	%g1, %g4
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	/* Log the event.  */
+	add	%sp, PTREGS_OFF, %o0
+	call	sun4v_resum_error
+	 mov	%l4, %o1
+
+	/* Return from trap.  */
+	ba,pt	%xcc, rtrap_irq
+	 nop
+
+sun4v_res_mondo_queue_empty:
+	retry
+
+sun4v_res_mondo_queue_full:
+	/* The queue is full, consolidate our damage by setting
+	 * the head equal to the tail.  We'll just trap again otherwise.
+	 * Call C code to log the event.
+	 */
+	mov	INTRQ_RESUM_MONDO_HEAD, %g2
+	stxa	%g4, [%g2] ASI_QUEUE
+	membar	#Sync
+
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	call	sun4v_resum_overflow
+	 add	%sp, PTREGS_OFF, %o0
+
+	ba,pt	%xcc, rtrap_irq
+	 nop
+
+sun4v_nonres_mondo:
+	/* Head offset in %g2, tail offset in %g4.  */
+	mov	INTRQ_NONRESUM_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_NONRESUM_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_nonres_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get RES mondo queue base phys address into %g5.  */
+	ldx	[%g3 + TRAP_PER_CPU_NONRESUM_MONDO_PA], %g5
+
+	/* Get RES kernel buffer base phys address into %g7.  */
+	ldx	[%g3 + TRAP_PER_CPU_NONRESUM_KBUF_PA], %g7
+
+	/* If the first word is non-zero, queue is full.  */
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g1
+	brnz,pn	%g1, sun4v_nonres_mondo_queue_full
+	 nop
+
+	/* Remember this entry's offset in %g1.  */
+	mov	%g2, %g1
+
+	/* Copy 64-byte queue entry into kernel buffer.  */
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+
+	/* Update queue head pointer.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_NONRESUM_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	/* Disable interrupts and save register state so we can call
+	 * C code.  The etrap handling will leave %g4 in %l4 for us
+	 * when it's done.
+	 */
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	mov	%g1, %g4
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	/* Log the event.  */
+	add	%sp, PTREGS_OFF, %o0
+	call	sun4v_nonresum_error
+	 mov	%l4, %o1
+
+	/* Return from trap.  */
+	ba,pt	%xcc, rtrap_irq
+	 nop
+
+sun4v_nonres_mondo_queue_empty:
+	retry
+
+sun4v_nonres_mondo_queue_full:
+	/* The queue is full, consolidate our damage by setting
+	 * the head equal to the tail.  We'll just trap again otherwise.
+	 * Call C code to log the event.
+	 */
+	mov	INTRQ_NONRESUM_MONDO_HEAD, %g2
+	stxa	%g4, [%g2] ASI_QUEUE
+	membar	#Sync
+
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	call	sun4v_nonresum_overflow
+	 add	%sp, PTREGS_OFF, %o0
+
+	ba,pt	%xcc, rtrap_irq
+	 nop
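All four handlers above share one queue discipline: head and tail are byte offsets kept in ASI_QUEUE registers, an empty queue is head == tail, and the sethi/or/and triple is just wrapping the advanced head with the mask 8192 - 1. A user-space C model of that dequeue (the queue size and 64-byte entry size are taken from the code above; everything else is illustrative, since the real head/tail live in hardware registers):

    #include <stdint.h>
    #include <string.h>

    #define QUEUE_BYTES	8192
    #define ENTRY_BYTES	64

    /* Returns 1 and copies out one entry if the queue is non-empty. */
    static int mondo_dequeue(const uint8_t *q, unsigned long *head,
    			 unsigned long tail, uint8_t out[ENTRY_BYTES])
    {
    	if (*head == tail)
    		return 0;			/* empty */

    	memcpy(out, q + *head, ENTRY_BYTES);
    	*head = (*head + ENTRY_BYTES) & (QUEUE_BYTES - 1);	/* wrap */
    	return 1;
    }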

+ 421 - 0
arch/sparc64/kernel/sun4v_tlb_miss.S

@@ -0,0 +1,421 @@
+/* sun4v_tlb_miss.S: Sun4v TLB miss handlers.
+ *
+ * Copyright (C) 2006 <davem@davemloft.net>
+ */
+
+	.text
+	.align	32
+
+	/* Load ITLB fault information into VADDR and CTX, using BASE.  */
+#define LOAD_ITLB_INFO(BASE, VADDR, CTX) \
+	ldx	[BASE + HV_FAULT_I_ADDR_OFFSET], VADDR; \
+	ldx	[BASE + HV_FAULT_I_CTX_OFFSET], CTX;
+
+	/* Load DTLB fault information into VADDR and CTX, using BASE.  */
+#define LOAD_DTLB_INFO(BASE, VADDR, CTX) \
+	ldx	[BASE + HV_FAULT_D_ADDR_OFFSET], VADDR; \
+	ldx	[BASE + HV_FAULT_D_CTX_OFFSET], CTX;
+
+	/* DEST = (VADDR >> 22)
+	 *
+	 * Branch to ZERO_CTX_LABEL if context is zero.
+	 */
+#define	COMPUTE_TAG_TARGET(DEST, VADDR, CTX, ZERO_CTX_LABEL) \
+	srlx	VADDR, 22, DEST; \
+	brz,pn	CTX, ZERO_CTX_LABEL; \
+	 nop;
+
+	/* Create TSB pointer.  This is something like:
+	 *
+	 * index_mask = (512 << (tsb_reg & 0x7UL)) - 1UL;
+	 * tsb_base = tsb_reg & ~0x7UL;
+	 * tsb_index = ((vaddr >> PAGE_SHIFT) & tsb_mask);
+	 * tsb_ptr = tsb_base + (tsb_index * 16);
+	 */
+#define COMPUTE_TSB_PTR(TSB_PTR, VADDR, TMP1, TMP2)	\
+	and	TSB_PTR, 0x7, TMP1;			\
+	mov	512, TMP2;				\
+	andn	TSB_PTR, 0x7, TSB_PTR;			\
+	sllx	TMP2, TMP1, TMP2;			\
+	srlx	VADDR, PAGE_SHIFT, TMP1;		\
+	sub	TMP2, 1, TMP2;				\
+	and	TMP1, TMP2, TMP1;			\
+	sllx	TMP1, 4, TMP1;				\
+	add	TSB_PTR, TMP1, TSB_PTR;
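The pseudocode in the comment is exactly what the eight instructions compute; in C, assuming PAGE_SHIFT of 13 (8K base pages) and the 16-byte tag/PTE entry layout the code loads with ldda:

    #include <stdint.h>

    #define PAGE_SHIFT_ 13	/* illustrative; sparc64 uses 8K base pages */

    static uint64_t compute_tsb_ptr(uint64_t tsb_reg, uint64_t vaddr)
    {
    	uint64_t nentries = 512ULL << (tsb_reg & 0x7UL); /* size field */
    	uint64_t base = tsb_reg & ~0x7UL;		 /* base field */
    	uint64_t idx = (vaddr >> PAGE_SHIFT_) & (nentries - 1);

    	return base + idx * 16;		/* 16 bytes per tag/pte pair */
    }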
+
+sun4v_itlb_miss:
+	/* Load MMU Miss base into %g2.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	
+	/* Load UTSB reg into %g1.  */
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+
+	LOAD_ITLB_INFO(%g2, %g4, %g5)
+	COMPUTE_TAG_TARGET(%g6, %g4, %g5, kvmap_itlb_4v)
+	COMPUTE_TSB_PTR(%g1, %g4, %g3, %g7)
+
+	/* Load TSB tag/pte into %g2/%g3 and compare the tag.  */
+	ldda	[%g1] ASI_QUAD_LDD_PHYS_4V, %g2
+	cmp	%g2, %g6
+	bne,a,pn %xcc, tsb_miss_page_table_walk
+	 mov	FAULT_CODE_ITLB, %g3
+	andcc	%g3, _PAGE_EXEC_4V, %g0
+	be,a,pn	%xcc, tsb_do_fault
+	 mov	FAULT_CODE_ITLB, %g3
+
+	/* We have a valid entry, make hypervisor call to load
+	 * I-TLB and return from trap.
+	 *
+	 * %g3:	PTE
+	 * %g4:	vaddr
+	 */
+sun4v_itlb_load:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	mov	%o0, %g1		! save %o0
+	mov	%o1, %g2		! save %o1
+	mov	%o2, %g5		! save %o2
+	mov	%o3, %g7		! save %o3
+	mov	%g4, %o0		! vaddr
+	ldx	[%g6 + HV_FAULT_I_CTX_OFFSET], %o1	! ctx
+	mov	%g3, %o2		! PTE
+	mov	HV_MMU_IMMU, %o3	! flags
+	ta	HV_MMU_MAP_ADDR_TRAP
+	brnz,pn	%o0, sun4v_itlb_error
+	 mov	%g2, %o1		! restore %o1
+	mov	%g1, %o0		! restore %o0
+	mov	%g5, %o2		! restore %o2
+	mov	%g7, %o3		! restore %o3
+
+	retry
+
+sun4v_dtlb_miss:
+	/* Load MMU Miss base into %g2.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	
+	/* Load UTSB reg into %g1.  */
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+
+	LOAD_DTLB_INFO(%g2, %g4, %g5)
+	COMPUTE_TAG_TARGET(%g6, %g4, %g5, kvmap_dtlb_4v)
+	COMPUTE_TSB_PTR(%g1, %g4, %g3, %g7)
+
+	/* Load TSB tag/pte into %g2/%g3 and compare the tag.  */
+	ldda	[%g1] ASI_QUAD_LDD_PHYS_4V, %g2
+	cmp	%g2, %g6
+	bne,a,pn %xcc, tsb_miss_page_table_walk
+	 mov	FAULT_CODE_DTLB, %g3
+
+	/* We have a valid entry, make hypervisor call to load
+	 * D-TLB and return from trap.
+	 *
+	 * %g3:	PTE
+	 * %g4:	vaddr
+	 */
+sun4v_dtlb_load:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	mov	%o0, %g1		! save %o0
+	mov	%o1, %g2		! save %o1
+	mov	%o2, %g5		! save %o2
+	mov	%o3, %g7		! save %o3
+	mov	%g4, %o0		! vaddr
+	ldx	[%g6 + HV_FAULT_D_CTX_OFFSET], %o1	! ctx
+	mov	%g3, %o2		! PTE
+	mov	HV_MMU_DMMU, %o3	! flags
+	ta	HV_MMU_MAP_ADDR_TRAP
+	brnz,pn	%o0, sun4v_dtlb_error
+	 mov	%g2, %o1		! restore %o1
+	mov	%g1, %o0		! restore %o0
+	mov	%g5, %o2		! restore %o2
+	mov	%g7, %o3		! restore %o3
+
+	retry
+
+sun4v_dtlb_prot:
+	SET_GL(1)
+
+	/* Load MMU Miss base into %g5.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g5
+	
+	ldx	[%g5 + HV_FAULT_D_ADDR_OFFSET], %g5
+	rdpr	%tl, %g1
+	cmp	%g1, 1
+	bgu,pn	%xcc, winfix_trampoline
+	 nop
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB | FAULT_CODE_WRITE, %g4
+
+	/* Called from trap table:
+	 * %g4:	vaddr
+	 * %g5:	context
+	 * %g6: TAG TARGET
+	 */
+sun4v_itsb_miss:
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+	brz,pn	%g5, kvmap_itlb_4v
+	 mov	FAULT_CODE_ITLB, %g3
+	ba,a,pt	%xcc, sun4v_tsb_miss_common
+
+	/* Called from trap table:
+	 * %g4:	vaddr
+	 * %g5:	context
+	 * %g6: TAG TARGET
+	 */
+sun4v_dtsb_miss:
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+	brz,pn	%g5, kvmap_dtlb_4v
+	 mov	FAULT_CODE_DTLB, %g3
+
+	/* fallthrough */
+
+	/* Create TSB pointer into %g1.  This is something like:
+	 *
+	 * index_mask = (512 << (tsb_reg & 0x7UL)) - 1UL;
+	 * tsb_base = tsb_reg & ~0x7UL;
+	 * tsb_index = ((vaddr >> PAGE_SHIFT) & tsb_mask);
+	 * tsb_ptr = tsb_base + (tsb_index * 16);
+	 */
+sun4v_tsb_miss_common:
+	COMPUTE_TSB_PTR(%g1, %g4, %g5, %g7)
+
+	/* Branch directly to page table lookup.  We have SCRATCHPAD_MMU_MISS
+	 * still in %g2, so it's quite trivial to get at the PGD PHYS value
+	 * so we can preload it into %g7.
+	 */
+	sub	%g2, TRAP_PER_CPU_FAULT_INFO, %g2
+	ba,pt	%xcc, tsb_miss_page_table_walk_sun4v_fastpath
+	 ldx	[%g2 + TRAP_PER_CPU_PGD_PADDR], %g7
+
+sun4v_itlb_error:
+	sethi	%hi(sun4v_err_itlb_vaddr), %g1
+	stx	%g4, [%g1 + %lo(sun4v_err_itlb_vaddr)]
+	sethi	%hi(sun4v_err_itlb_ctx), %g1
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	ldx	[%g6 + HV_FAULT_I_CTX_OFFSET], %o1
+	stx	%o1, [%g1 + %lo(sun4v_err_itlb_ctx)]
+	sethi	%hi(sun4v_err_itlb_pte), %g1
+	stx	%g3, [%g1 + %lo(sun4v_err_itlb_pte)]
+	sethi	%hi(sun4v_err_itlb_error), %g1
+	stx	%o0, [%g1 + %lo(sun4v_err_itlb_error)]
+
+	rdpr	%tl, %g4
+	cmp	%g4, 1
+	ble,pt	%icc, 1f
+	 sethi	%hi(2f), %g7
+	ba,pt	%xcc, etraptl1
+	 or	%g7, %lo(2f), %g7
+
+1:	ba,pt	%xcc, etrap
+2:	 or	%g7, %lo(2b), %g7
+	call	sun4v_itlb_error_report
+	 add	%sp, PTREGS_OFF, %o0
+
+	/* NOTREACHED */
+
+sun4v_dtlb_error:
+	sethi	%hi(sun4v_err_dtlb_vaddr), %g1
+	stx	%g4, [%g1 + %lo(sun4v_err_dtlb_vaddr)]
+	sethi	%hi(sun4v_err_dtlb_ctx), %g1
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	ldx	[%g6 + HV_FAULT_D_CTX_OFFSET], %o1
+	stx	%o1, [%g1 + %lo(sun4v_err_dtlb_ctx)]
+	sethi	%hi(sun4v_err_dtlb_pte), %g1
+	stx	%g3, [%g1 + %lo(sun4v_err_dtlb_pte)]
+	sethi	%hi(sun4v_err_dtlb_error), %g1
+	stx	%o0, [%g1 + %lo(sun4v_err_dtlb_error)]
+
+	rdpr	%tl, %g4
+	cmp	%g4, 1
+	ble,pt	%icc, 1f
+	 sethi	%hi(2f), %g7
+	ba,pt	%xcc, etraptl1
+	 or	%g7, %lo(2f), %g7
+
+1:	ba,pt	%xcc, etrap
+2:	 or	%g7, %lo(2b), %g7
+	call	sun4v_dtlb_error_report
+	 add	%sp, PTREGS_OFF, %o0
+
+	/* NOTREACHED */
+
+	/* Instruction Access Exception, tl0. */
+sun4v_iacc:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_I_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_I_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_I_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_insn_access_exception
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Instruction Access Exception, tl1. */
+sun4v_iacc_tl1:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_I_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_I_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_I_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etraptl1
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_insn_access_exception_tl1
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Data Access Exception, tl0. */
+sun4v_dacc:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_data_access_exception
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Data Access Exception, tl1. */
+sun4v_dacc_tl1:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etraptl1
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_data_access_exception_tl1
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Memory Address Unaligned.  */
+sun4v_mna:
+	/* Window fixup? */
+	rdpr	%tl, %g2
+	cmp	%g2, 1
+	ble,pt	%icc, 1f
+	 nop
+
+	SET_GL(1)
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g5
+	mov	HV_FAULT_TYPE_UNALIGNED, %g3
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g4
+	sllx	%g3, 16, %g3
+	or	%g4, %g3, %g4
+	ba,pt	%xcc, winfix_mna
+	 rdpr	%tpc, %g3
+	/* not reached */
+
+1:	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	mov	HV_FAULT_TYPE_UNALIGNED, %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_do_mna
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Privileged Action.  */
+sun4v_privact:
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	call	do_privact
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Unaligned ldd float, tl0. */
+sun4v_lddfmna:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	handle_lddfmna
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Unaligned std float, tl0. */
+sun4v_stdfmna:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	handle_stdfmna
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define SUN4V_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	sun4v_patch_tlb_handlers
+	.type	sun4v_patch_tlb_handlers,#function
+sun4v_patch_tlb_handlers:
+	SUN4V_DO_PATCH(tl0_iamiss, sun4v_itlb_miss)
+	SUN4V_DO_PATCH(tl1_iamiss, sun4v_itlb_miss)
+	SUN4V_DO_PATCH(tl0_damiss, sun4v_dtlb_miss)
+	SUN4V_DO_PATCH(tl1_damiss, sun4v_dtlb_miss)
+	SUN4V_DO_PATCH(tl0_daprot, sun4v_dtlb_prot)
+	SUN4V_DO_PATCH(tl1_daprot, sun4v_dtlb_prot)
+	SUN4V_DO_PATCH(tl0_iax, sun4v_iacc)
+	SUN4V_DO_PATCH(tl1_iax, sun4v_iacc_tl1)
+	SUN4V_DO_PATCH(tl0_dax, sun4v_dacc)
+	SUN4V_DO_PATCH(tl1_dax, sun4v_dacc_tl1)
+	SUN4V_DO_PATCH(tl0_mna, sun4v_mna)
+	SUN4V_DO_PATCH(tl1_mna, sun4v_mna)
+	SUN4V_DO_PATCH(tl0_lddfmna, sun4v_lddfmna)
+	SUN4V_DO_PATCH(tl0_stdfmna, sun4v_stdfmna)
+	SUN4V_DO_PATCH(tl0_privact, sun4v_privact)
+	retl
+	 nop
+	.size	sun4v_patch_tlb_handlers,.-sun4v_patch_tlb_handlers

+ 263 - 34
arch/sparc64/kernel/sys_sparc.c

@@ -25,25 +25,93 @@
 #include <linux/syscalls.h>
 #include <linux/ipc.h>
 #include <linux/personality.h>
+#include <linux/random.h>
 
 #include <asm/uaccess.h>
 #include <asm/ipc.h>
 #include <asm/utrap.h>
 #include <asm/perfctr.h>
+#include <asm/a.out.h>
 
 /* #define DEBUG_UNIMP_SYSCALL */
 
-/* XXX Make this per-binary type, this way we can detect the type of
- * XXX a binary.  Every Sparc executable calls this very early on.
- */
 asmlinkage unsigned long sys_getpagesize(void)
 {
 	return PAGE_SIZE;
 }
 
-#define COLOUR_ALIGN(addr,pgoff)		\
-	((((addr)+SHMLBA-1)&~(SHMLBA-1)) +	\
-	 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
+#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL))
+#define VA_EXCLUDE_END   (0xfffff80000000000UL + (1UL << 32UL))
+
+/* Does addr --> addr+len fall within 4GB of the VA-space hole or
+ * overflow past the end of the 64-bit address space?
+ */
+static inline int invalid_64bit_range(unsigned long addr, unsigned long len)
+{
+	unsigned long va_exclude_start, va_exclude_end;
+
+	va_exclude_start = VA_EXCLUDE_START;
+	va_exclude_end   = VA_EXCLUDE_END;
+
+	if (unlikely(len >= va_exclude_start))
+		return 1;
+
+	if (unlikely((addr + len) < addr))
+		return 1;
+
+	if (unlikely((addr >= va_exclude_start && addr < va_exclude_end) ||
+		     ((addr + len) >= va_exclude_start &&
+		      (addr + len) < va_exclude_end)))
+		return 1;
+
+	return 0;
+}
+
+/* Does start,end straddle the VA-space hole?  */
+static inline int straddles_64bit_va_hole(unsigned long start, unsigned long end)
+{
+	unsigned long va_exclude_start, va_exclude_end;
+
+	va_exclude_start = VA_EXCLUDE_START;
+	va_exclude_end   = VA_EXCLUDE_END;
+
+	if (likely(start < va_exclude_start && end < va_exclude_start))
+		return 0;
+
+	if (likely(start >= va_exclude_end && end >= va_exclude_end))
+		return 0;
+
+	return 1;
+}
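For concreteness: the UltraSPARC VA hole spans [0x0000080000000000, 0xfffff80000000000), and the constants above pad it by 4GB on each side. A tiny stand-alone check showing a mapping rejected because its tail crosses into the padded region (user-space, illustrative values):

    #include <stdio.h>

    #define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL))

    int main(void)
    {
    	unsigned long addr = VA_EXCLUDE_START - 8192;	/* starts below... */
    	unsigned long len  = 3 * 8192;			/* ...ends inside */
    	int bad = addr < VA_EXCLUDE_START &&
    		  addr + len >= VA_EXCLUDE_START;

    	printf("invalid: %d\n", bad);	/* prints: invalid: 1 */
    	return 0;
    }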
+
+/* These functions differ from the default implementations in
+ * mm/mmap.c in two ways:
+ *
+ * 1) For file backed MAP_SHARED mmap()'s we D-cache color align,
+ *    for fixed such mappings we just validate what the user gave us.
+ * 2) For 64-bit tasks we avoid mapping anything within 4GB of
+ *    the spitfire/niagara VA-hole.
+ */
+
+static inline unsigned long COLOUR_ALIGN(unsigned long addr,
+					 unsigned long pgoff)
+{
+	unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1);
+	unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1);
+
+	return base + off;
+}
+
+static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
+					      unsigned long pgoff)
+{
+	unsigned long base = addr & ~(SHMLBA-1);
+	unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1);
+
+	if (base + off <= addr)
+		return base + off;
+	return base - off;
+}
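COLOUR_ALIGN rounds up to the next SHMLBA boundary and then re-applies the mapping's file-offset colour, so two SHARED mappings of the same page always land on the same D-cache colour; COLOUR_ALIGN_DOWN does the same searching downward for the topdown allocator. Worked numbers (the SHMLBA value here is illustrative; see asm/shmparam.h for the real one):

    #include <stdio.h>

    #define SHMLBA_		16384	/* illustrative alias span */
    #define PAGE_SHIFT_	13

    int main(void)
    {
    	unsigned long addr = 0x40001234, pgoff = 3;
    	unsigned long base = (addr + SHMLBA_ - 1) & ~(SHMLBA_ - 1UL);
    	unsigned long off  = (pgoff << PAGE_SHIFT_) & (SHMLBA_ - 1UL);

    	/* 0x40001234 -> base 0x40004000, colour 0x2000 -> 0x40006000 */
    	printf("%#lx\n", base + off);
    	return 0;
    }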
 
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -64,8 +132,8 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 	}
 
 	if (test_thread_flag(TIF_32BIT))
-		task_size = 0xf0000000UL;
-	if (len > task_size || len > -PAGE_OFFSET)
+		task_size = STACK_TOP32;
+	if (unlikely(len > task_size || len >= VA_EXCLUDE_START))
 		return -ENOMEM;
 
 	do_color_align = 0;
@@ -84,11 +152,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 			return addr;
 	}
 
-	if (len <= mm->cached_hole_size) {
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
 	        mm->cached_hole_size = 0;
-		mm->free_area_cache = TASK_UNMAPPED_BASE;
 	}
-	start_addr = addr = mm->free_area_cache;
 
 	task_size -= len;
 
@@ -100,11 +169,12 @@ full_search:
 
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (addr < PAGE_OFFSET && -PAGE_OFFSET - len < addr) {
-			addr = PAGE_OFFSET;
-			vma = find_vma(mm, PAGE_OFFSET);
+		if (addr < VA_EXCLUDE_START &&
+		    (addr + len) >= VA_EXCLUDE_START) {
+			addr = VA_EXCLUDE_END;
+			vma = find_vma(mm, VA_EXCLUDE_END);
 		}
-		if (task_size < addr) {
+		if (unlikely(task_size < addr)) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
 				mm->cached_hole_size = 0;
@@ -112,7 +182,7 @@ full_search:
 			}
 			return -ENOMEM;
 		}
-		if (!vma || addr + len <= vma->vm_start) {
+		if (likely(!vma || addr + len <= vma->vm_start)) {
 			/*
 			 * Remember the place where we stopped the search:
 			 */
@@ -128,6 +198,121 @@ full_search:
 	}
 }
 
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+			  const unsigned long len, const unsigned long pgoff,
+			  const unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long task_size = STACK_TOP32;
+	unsigned long addr = addr0;
+	int do_color_align;
+
+	/* This should only ever run for 32-bit processes.  */
+	BUG_ON(!test_thread_flag(TIF_32BIT));
+
+	if (flags & MAP_FIXED) {
+		/* We do not accept a shared mapping if it would violate
+		 * cache aliasing constraints.
+		 */
+		if ((flags & MAP_SHARED) &&
+		    ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (unlikely(len > task_size))
+		return -ENOMEM;
+
+	do_color_align = 0;
+	if (filp || (flags & MAP_SHARED))
+		do_color_align = 1;
+
+	/* requesting a specific address */
+	if (addr) {
+		if (do_color_align)
+			addr = COLOUR_ALIGN(addr, pgoff);
+		else
+			addr = PAGE_ALIGN(addr);
+
+		vma = find_vma(mm, addr);
+		if (task_size - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+ 	        mm->cached_hole_size = 0;
+ 		mm->free_area_cache = mm->mmap_base;
+ 	}
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = mm->free_area_cache;
+	if (do_color_align) {
+		unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
+
+		addr = base + len;
+	}
+
+	/* make sure it can fit in the remaining address space */
+	if (likely(addr > len)) {
+		vma = find_vma(mm, addr-len);
+		if (!vma || addr <= vma->vm_start) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr-len);
+		}
+	}
+
+	if (unlikely(mm->mmap_base < len))
+		goto bottomup;
+
+	addr = mm->mmap_base-len;
+	if (do_color_align)
+		addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (likely(!vma || addr+len <= vma->vm_start)) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+		}
+
+ 		/* remember the largest hole we saw so far */
+ 		if (addr + mm->cached_hole_size < vma->vm_start)
+ 		        mm->cached_hole_size = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start-len;
+		if (do_color_align)
+			addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+	} while (likely(len < vma->vm_start));
+
+bottomup:
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->cached_hole_size = ~0UL;
+  	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
+
 /* Try to align mapping such that we align it as much as possible. */
 unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -171,15 +356,57 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
 	return addr;
 }
 
+/* Essentially the same as PowerPC... */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	unsigned long random_factor = 0UL;
+
+	if (current->flags & PF_RANDOMIZE) {
+		random_factor = get_random_int();
+		if (test_thread_flag(TIF_32BIT))
+			random_factor &= ((1 * 1024 * 1024) - 1);
+		else
+			random_factor = ((random_factor << PAGE_SHIFT) &
+					 0xffffffffUL);
+	}
+
+	/*
+	 * Fall back to the standard layout if the personality
+	 * bit is set, or if the expected stack growth is unlimited:
+	 */
+	if (!test_thread_flag(TIF_32BIT) ||
+	    (current->personality & ADDR_COMPAT_LAYOUT) ||
+	    current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY ||
+	    sysctl_legacy_va_layout) {
+		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+		mm->unmap_area = arch_unmap_area;
+	} else {
+		/* We know it's 32-bit */
+		unsigned long task_size = STACK_TOP32;
+		unsigned long gap;
+
+		gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+		if (gap < 128 * 1024 * 1024)
+			gap = 128 * 1024 * 1024;
+		if (gap > (task_size / 6 * 5))
+			gap = (task_size / 6 * 5);
+
+		mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		mm->unmap_area = arch_unmap_area_topdown;
+	}
+}
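Worked numbers for the topdown base, using STACK_TOP32 = 0xf0000000 (which the rest of this patch substitutes for the old literal) and a typical 8MB stack rlimit: the gap clamps up to 128MB, so mmap_base lands near 0xe8000000 before the random offset (at most 1MB for 32-bit tasks) is subtracted. The clamp in isolation:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long task_size = 0xf0000000UL;		/* STACK_TOP32 */
    	unsigned long gap = 8UL * 1024 * 1024;		/* stack rlimit */

    	if (gap < 128 * 1024 * 1024)
    		gap = 128 * 1024 * 1024;		/* floor: 128MB */
    	if (gap > task_size / 6 * 5)
    		gap = task_size / 6 * 5;		/* ceiling: 5/6 */

    	printf("mmap_base ~ %#lx\n", task_size - gap);	/* ~0xe8000000 */
    	return 0;
    }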
+
 asmlinkage unsigned long sparc_brk(unsigned long brk)
 {
 	/* People could try to be nasty and use ta 0x6d in 32bit programs */
-	if (test_thread_flag(TIF_32BIT) &&
-	    brk >= 0xf0000000UL)
+	if (test_thread_flag(TIF_32BIT) && brk >= STACK_TOP32)
 		return current->mm->brk;
 
-	if ((current->mm->brk & PAGE_OFFSET) != (brk & PAGE_OFFSET))
+	if (unlikely(straddles_64bit_va_hole(current->mm->brk, brk)))
 		return current->mm->brk;
+
 	return sys_brk(brk);
 }
 
@@ -340,13 +567,16 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len,
 	retval = -EINVAL;
 
 	if (test_thread_flag(TIF_32BIT)) {
-		if (len > 0xf0000000UL ||
-		    ((flags & MAP_FIXED) && addr > 0xf0000000UL - len))
+		if (len >= STACK_TOP32)
+			goto out_putf;
+
+		if ((flags & MAP_FIXED) && addr > STACK_TOP32 - len)
 			goto out_putf;
 	} else {
-		if (len > -PAGE_OFFSET ||
-		    ((flags & MAP_FIXED) &&
-		     addr < PAGE_OFFSET && addr + len > -PAGE_OFFSET))
+		if (len >= VA_EXCLUDE_START)
+			goto out_putf;
+
+		if ((flags & MAP_FIXED) && invalid_64bit_range(addr, len))
 			goto out_putf;
 	}
 
@@ -365,9 +595,9 @@ asmlinkage long sys64_munmap(unsigned long addr, size_t len)
 {
 	long ret;
 
-	if (len > -PAGE_OFFSET ||
-	    (addr < PAGE_OFFSET && addr + len > -PAGE_OFFSET))
+	if (invalid_64bit_range(addr, len))
 		return -EINVAL;
+
 	down_write(&current->mm->mmap_sem);
 	ret = do_munmap(current->mm, addr, len);
 	up_write(&current->mm->mmap_sem);
@@ -384,18 +614,19 @@ asmlinkage unsigned long sys64_mremap(unsigned long addr,
 {
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
+
 	if (test_thread_flag(TIF_32BIT))
 		goto out;
-	if (old_len > -PAGE_OFFSET || new_len > -PAGE_OFFSET)
+	if (unlikely(new_len >= VA_EXCLUDE_START))
 		goto out;
-	if (addr < PAGE_OFFSET && addr + old_len > -PAGE_OFFSET)
+	if (unlikely(invalid_64bit_range(addr, old_len)))
 		goto out;
+
 	down_write(&current->mm->mmap_sem);
 	if (flags & MREMAP_FIXED) {
-		if (new_addr < PAGE_OFFSET &&
-		    new_addr + new_len > -PAGE_OFFSET)
+		if (invalid_64bit_range(new_addr, new_len))
 			goto out_sem;
-	} else if (addr < PAGE_OFFSET && addr + new_len > -PAGE_OFFSET) {
+	} else if (invalid_64bit_range(addr, new_len)) {
 		unsigned long map_flags = 0;
 		struct file *file = NULL;
 
@@ -554,12 +785,10 @@ asmlinkage long sys_utrap_install(utrap_entry_t type,
 	}
 	if (!current_thread_info()->utraps) {
 		current_thread_info()->utraps =
-			kmalloc((UT_TRAP_INSTRUCTION_31+1)*sizeof(long), GFP_KERNEL);
+			kzalloc((UT_TRAP_INSTRUCTION_31+1)*sizeof(long), GFP_KERNEL);
 		if (!current_thread_info()->utraps)
 			return -ENOMEM;
 		current_thread_info()->utraps[0] = 1;
-		memset(current_thread_info()->utraps+1, 0,
-		       UT_TRAP_INSTRUCTION_31*sizeof(long));
 	} else {
 		if ((utrap_handler_t)current_thread_info()->utraps[type] != new_p &&
 		    current_thread_info()->utraps[0] > 1) {

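The kzalloc() conversion in sys_utrap_install() above folds an allocate-then-memset pair into a single zeroed allocation. The userland analogue of the same transformation, as a sketch:

#include <stdlib.h>
#include <string.h>

/* Before: allocate, then zero everything past slot 0 by hand. */
static long *utraps_old(size_t n)
{
	long *p = malloc((n + 1) * sizeof(long));

	if (!p)
		return NULL;
	p[0] = 1;			/* reference count */
	memset(p + 1, 0, n * sizeof(long));
	return p;
}

/* After: one zeroing allocation, then set the refcount. */
static long *utraps_new(size_t n)
{
	long *p = calloc(n + 1, sizeof(long));

	if (!p)
		return NULL;
	p[0] = 1;
	return p;
}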
+ 5 - 4
arch/sparc64/kernel/sys_sparc32.c

@@ -62,6 +62,7 @@
 #include <asm/fpumacro.h>
 #include <asm/semaphore.h>
 #include <asm/mmu_context.h>
+#include <asm/a.out.h>
 
 asmlinkage long sys32_chown16(const char __user * filename, u16 user, u16 group)
 {
@@ -1039,15 +1040,15 @@ asmlinkage unsigned long sys32_mremap(unsigned long addr,
 	unsigned long ret = -EINVAL;
 	unsigned long new_addr = __new_addr;
 
-	if (old_len > 0xf0000000UL || new_len > 0xf0000000UL)
+	if (old_len > STACK_TOP32 || new_len > STACK_TOP32)
 		goto out;
-	if (addr > 0xf0000000UL - old_len)
+	if (addr > STACK_TOP32 - old_len)
 		goto out;
 	down_write(&current->mm->mmap_sem);
 	if (flags & MREMAP_FIXED) {
-		if (new_addr > 0xf0000000UL - new_len)
+		if (new_addr > STACK_TOP32 - new_len)
 			goto out_sem;
-	} else if (addr > 0xf0000000UL - new_len) {
+	} else if (addr > STACK_TOP32 - new_len) {
 		unsigned long map_flags = 0;
 		struct file *file = NULL;
 

+ 348 - 25
arch/sparc64/kernel/time.c

@@ -30,6 +30,8 @@
 #include <linux/cpufreq.h>
 #include <linux/percpu.h>
 #include <linux/profile.h>
+#include <linux/miscdevice.h>
+#include <linux/rtc.h>
 
 #include <asm/oplib.h>
 #include <asm/mostek.h>
@@ -45,6 +47,7 @@
 #include <asm/smp.h>
 #include <asm/sections.h>
 #include <asm/cpudata.h>
+#include <asm/uaccess.h>
 
 DEFINE_SPINLOCK(mostek_lock);
 DEFINE_SPINLOCK(rtc_lock);
@@ -193,16 +196,22 @@ struct sparc64_tick_ops *tick_ops __read_mostly = &tick_operations;
 
 static void stick_init_tick(unsigned long offset)
 {
-	tick_disable_protection();
-
-	/* Let the user get at STICK too. */
-	__asm__ __volatile__(
-	"	rd	%%asr24, %%g2\n"
-	"	andn	%%g2, %0, %%g2\n"
-	"	wr	%%g2, 0, %%asr24"
-	: /* no outputs */
-	: "r" (TICK_PRIV_BIT)
-	: "g1", "g2");
+	/* Writes to the %tick and %stick register are not
+	 * allowed on sun4v.  The Hypervisor controls that
+	 * bit, per-strand.
+	 */
+	if (tlb_type != hypervisor) {
+		tick_disable_protection();
+
+		/* Let the user get at STICK too. */
+		__asm__ __volatile__(
+		"	rd	%%asr24, %%g2\n"
+		"	andn	%%g2, %0, %%g2\n"
+		"	wr	%%g2, 0, %%asr24"
+		: /* no outputs */
+		: "r" (TICK_PRIV_BIT)
+		: "g1", "g2");
+	}
 
 	__asm__ __volatile__(
 	"	rd	%%asr24, %%g1\n"
@@ -683,6 +692,83 @@ static void __init set_system_time(void)
 	}
 }
 
+/* davem suggests we keep this within the 4M locked kernel image */
+static u32 starfire_get_time(void)
+{
+	static char obp_gettod[32];
+	static u32 unix_tod;
+
+	sprintf(obp_gettod, "h# %08x unix-gettod",
+		(unsigned int) (long) &unix_tod);
+	prom_feval(obp_gettod);
+
+	return unix_tod;
+}
+
+static int starfire_set_time(u32 val)
+{
+	/* Do nothing, time is set using the service processor
+	 * console on this platform.
+	 */
+	return 0;
+}
+
+static u32 hypervisor_get_time(void)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+	register unsigned long arg1 asm("%o1");
+	int retries = 10000;
+
+retry:
+	func = HV_FAST_TOD_GET;
+	arg0 = 0;
+	arg1 = 0;
+	__asm__ __volatile__("ta	%6"
+			     : "=&r" (func), "=&r" (arg0), "=&r" (arg1)
+			     : "0" (func), "1" (arg0), "2" (arg1),
+			       "i" (HV_FAST_TRAP));
+	if (arg0 == HV_EOK)
+		return arg1;
+	if (arg0 == HV_EWOULDBLOCK) {
+		if (--retries > 0) {
+			udelay(100);
+			goto retry;
+		}
+		printk(KERN_WARNING "SUN4V: tod_get() timed out.\n");
+		return 0;
+	}
+	printk(KERN_WARNING "SUN4V: tod_get() not supported.\n");
+	return 0;
+}
+
+static int hypervisor_set_time(u32 secs)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+	int retries = 10000;
+
+retry:
+	func = HV_FAST_TOD_SET;
+	arg0 = secs;
+	__asm__ __volatile__("ta	%4"
+			     : "=&r" (func), "=&r" (arg0)
+			     : "0" (func), "1" (arg0),
+			       "i" (HV_FAST_TRAP));
+	if (arg0 == HV_EOK)
+		return 0;
+	if (arg0 == HV_EWOULDBLOCK) {
+		if (--retries > 0) {
+			udelay(100);
+			goto retry;
+		}
+		printk(KERN_WARNING "SUN4V: tod_set() timed out.\n");
+		return -EAGAIN;
+	}
+	printk(KERN_WARNING "SUN4V: tod_set() not supported.\n");
+	return -EOPNOTSUPP;
+}
+
 void __init clock_probe(void)
 {
 	struct linux_prom_registers clk_reg[2];
@@ -702,14 +788,14 @@ void __init clock_probe(void)
 
 
 	if (this_is_starfire) {
-		/* davem suggests we keep this within the 4M locked kernel image */
-		static char obp_gettod[256];
-		static u32 unix_tod;
-
-		sprintf(obp_gettod, "h# %08x unix-gettod",
-			(unsigned int) (long) &unix_tod);
-		prom_feval(obp_gettod);
-		xtime.tv_sec = unix_tod;
+		xtime.tv_sec = starfire_get_time();
+		xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+		set_normalized_timespec(&wall_to_monotonic,
+		                        -xtime.tv_sec, -xtime.tv_nsec);
+		return;
+	}
+	if (tlb_type == hypervisor) {
+		xtime.tv_sec = hypervisor_get_time();
 		xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
 		set_normalized_timespec(&wall_to_monotonic,
 		                        -xtime.tv_sec, -xtime.tv_nsec);
@@ -981,11 +1067,10 @@ static void sparc64_start_timers(irqreturn_t (*cfunc)(int, void *, struct pt_reg
 }
 
 struct freq_table {
-	unsigned long udelay_val_ref;
 	unsigned long clock_tick_ref;
 	unsigned int ref_freq;
 };
-static DEFINE_PER_CPU(struct freq_table, sparc64_freq_table) = { 0, 0, 0 };
+static DEFINE_PER_CPU(struct freq_table, sparc64_freq_table) = { 0, 0 };
 
 unsigned long sparc64_get_clock_tick(unsigned int cpu)
 {
@@ -1007,16 +1092,11 @@ static int sparc64_cpufreq_notifier(struct notifier_block *nb, unsigned long val
 
 	if (!ft->ref_freq) {
 		ft->ref_freq = freq->old;
-		ft->udelay_val_ref = cpu_data(cpu).udelay_val;
 		ft->clock_tick_ref = cpu_data(cpu).clock_tick;
 	}
 	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
 	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
 	    (val == CPUFREQ_RESUMECHANGE)) {
-		cpu_data(cpu).udelay_val =
-			cpufreq_scale(ft->udelay_val_ref,
-				      ft->ref_freq,
-				      freq->new);
 		cpu_data(cpu).clock_tick =
 			cpufreq_scale(ft->clock_tick_ref,
 				      ft->ref_freq,
@@ -1179,3 +1259,246 @@ static int set_rtc_mmss(unsigned long nowtime)
 		return retval;
 	}
 }
+
+#define RTC_IS_OPEN		0x01	/* means /dev/rtc is in use	*/
+static unsigned char mini_rtc_status;	/* bitmapped status byte.	*/
+
+/* months start at 0 now */
+static unsigned char days_in_mo[] =
+{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
+
+#define FEBRUARY	2
+#define	STARTOFTIME	1970
+#define SECDAY		86400L
+#define SECYR		(SECDAY * 365)
+#define	leapyear(year)		((year) % 4 == 0 && \
+				 ((year) % 100 != 0 || (year) % 400 == 0))
+#define	days_in_year(a) 	(leapyear(a) ? 366 : 365)
+#define	days_in_month(a) 	(month_days[(a) - 1])
+
+static int month_days[12] = {
+	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+/*
+ * This only works for the Gregorian calendar - i.e. after 1752 (in the UK)
+ */
+static void GregorianDay(struct rtc_time * tm)
+{
+	int leapsToDate;
+	int lastYear;
+	int day;
+	int MonthOffset[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 };
+
+	lastYear = tm->tm_year - 1;
+
+	/*
+	 * Number of leap corrections to apply up to end of last year
+	 */
+	leapsToDate = lastYear / 4 - lastYear / 100 + lastYear / 400;
+
+	/*
+	 * This year is a leap year if it is divisible by 4 except when it is
+	 * divisible by 100 unless it is divisible by 400
+	 *
+	 * e.g. 1904 was a leap year, 1900 was not, 1996 is, and 2000 was
+	 */
+	day = tm->tm_mon > 2 && leapyear(tm->tm_year);
+
+	day += lastYear*365 + leapsToDate + MonthOffset[tm->tm_mon-1] +
+		   tm->tm_mday;
+
+	tm->tm_wday = day % 7;
+}
+
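A quick sanity check of GregorianDay()'s formula for 1 January 1970, which was a Thursday (tm_wday 4, with Sunday as 0):

#include <assert.h>

int main(void)
{
	int last = 1969;
	/* leap corrections through the end of 1969 */
	int leaps = last / 4 - last / 100 + last / 400;	/* 492 - 19 + 4 = 477 */
	long day = (long)last * 365 + leaps
		 + 0	/* MonthOffset[0], January */
		 + 1;	/* tm_mday */

	assert(day == 719163);
	assert(day % 7 == 4);	/* Thursday */
	return 0;
}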
+static void to_tm(int tim, struct rtc_time *tm)
+{
+	register int    i;
+	register long   hms, day;
+
+	day = tim / SECDAY;
+	hms = tim % SECDAY;
+
+	/* Hours, minutes, seconds are easy */
+	tm->tm_hour = hms / 3600;
+	tm->tm_min = (hms % 3600) / 60;
+	tm->tm_sec = (hms % 3600) % 60;
+
+	/* Number of years in days */
+	for (i = STARTOFTIME; day >= days_in_year(i); i++)
+		day -= days_in_year(i);
+	tm->tm_year = i;
+
+	/* Number of months in days left */
+	if (leapyear(tm->tm_year))
+		days_in_month(FEBRUARY) = 29;
+	for (i = 1; day >= days_in_month(i); i++)
+		day -= days_in_month(i);
+	days_in_month(FEBRUARY) = 28;
+	tm->tm_mon = i;
+
+	/* Days are what is left over (+1) from all that. */
+	tm->tm_mday = day + 1;
+
+	/*
+	 * Determine the day of week
+	 */
+	GregorianDay(tm);
+}
+
+/* Both Starfire and SUN4V give us seconds since Jan 1st, 1970,
+ * aka Unix time.  So we have to convert to/from rtc_time.
+ */
+static inline void mini_get_rtc_time(struct rtc_time *time)
+{
+	unsigned long flags;
+	u32 seconds;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	seconds = 0;
+	if (this_is_starfire)
+		seconds = starfire_get_time();
+	else if (tlb_type == hypervisor)
+		seconds = hypervisor_get_time();
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	to_tm(seconds, time);
+	time->tm_year -= 1900;
+	time->tm_mon -= 1;
+}
+
+static inline int mini_set_rtc_time(struct rtc_time *time)
+{
+	u32 seconds = mktime(time->tm_year + 1900, time->tm_mon + 1,
+			     time->tm_mday, time->tm_hour,
+			     time->tm_min, time->tm_sec);
+	unsigned long flags;
+	int err;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	err = -ENODEV;
+	if (this_is_starfire)
+		err = starfire_set_time(seconds);
+	else  if (tlb_type == hypervisor)
+		err = hypervisor_set_time(seconds);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	return err;
+}
+
+static int mini_rtc_ioctl(struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg)
+{
+	struct rtc_time wtime;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+
+	case RTC_PLL_GET:
+		return -EINVAL;
+
+	case RTC_PLL_SET:
+		return -EINVAL;
+
+	case RTC_UIE_OFF:	/* disable ints from RTC updates.	*/
+		return 0;
+
+	case RTC_UIE_ON:	/* enable ints for RTC updates.	*/
+	        return -EINVAL;
+
+	case RTC_RD_TIME:	/* Read the time/date from RTC	*/
+		/* this doesn't get week-day, who cares */
+		memset(&wtime, 0, sizeof(wtime));
+		mini_get_rtc_time(&wtime);
+
+		return copy_to_user(argp, &wtime, sizeof(wtime)) ? -EFAULT : 0;
+
+	case RTC_SET_TIME:	/* Set the RTC */
+	    {
+		int year;
+		unsigned char leap_yr;
+
+		if (!capable(CAP_SYS_TIME))
+			return -EACCES;
+
+		if (copy_from_user(&wtime, argp, sizeof(wtime)))
+			return -EFAULT;
+
+		year = wtime.tm_year + 1900;
+		leap_yr = ((!(year % 4) && (year % 100)) ||
+			   !(year % 400));
+
+		if ((wtime.tm_mon < 0 || wtime.tm_mon > 11) || (wtime.tm_mday < 1))
+			return -EINVAL;
+
+		if (wtime.tm_mday < 0 || wtime.tm_mday >
+		    (days_in_mo[wtime.tm_mon] + ((wtime.tm_mon == 1) && leap_yr)))
+			return -EINVAL;
+
+		if (wtime.tm_hour < 0 || wtime.tm_hour >= 24 ||
+		    wtime.tm_min < 0 || wtime.tm_min >= 60 ||
+		    wtime.tm_sec < 0 || wtime.tm_sec >= 60)
+			return -EINVAL;
+
+		return mini_set_rtc_time(&wtime);
+	    }
+	}
+
+	return -EINVAL;
+}
+
+static int mini_rtc_open(struct inode *inode, struct file *file)
+{
+	if (mini_rtc_status & RTC_IS_OPEN)
+		return -EBUSY;
+
+	mini_rtc_status |= RTC_IS_OPEN;
+
+	return 0;
+}
+
+static int mini_rtc_release(struct inode *inode, struct file *file)
+{
+	mini_rtc_status &= ~RTC_IS_OPEN;
+	return 0;
+}
+
+
+static struct file_operations mini_rtc_fops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= mini_rtc_ioctl,
+	.open		= mini_rtc_open,
+	.release	= mini_rtc_release,
+};
+
+static struct miscdevice rtc_mini_dev =
+{
+	.minor		= RTC_MINOR,
+	.name		= "rtc",
+	.fops		= &mini_rtc_fops,
+};
+
+static int __init rtc_mini_init(void)
+{
+	int retval;
+
+	if (tlb_type != hypervisor && !this_is_starfire)
+		return -ENODEV;
+
+	printk(KERN_INFO "Mini RTC Driver\n");
+
+	retval = misc_register(&rtc_mini_dev);
+	if (retval < 0)
+		return retval;
+
+	return 0;
+}
+
+static void __exit rtc_mini_exit(void)
+{
+	misc_deregister(&rtc_mini_dev);
+}
+
+
+module_init(rtc_mini_init);
+module_exit(rtc_mini_exit);

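The mini RTC answers the standard /dev/rtc ioctls (RTC_RD_TIME, RTC_SET_TIME), so a stock userland reader works against it unchanged. A minimal usage sketch:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/rtc.h>

int main(void)
{
	struct rtc_time tm;
	int fd = open("/dev/rtc", O_RDONLY);

	if (fd < 0 || ioctl(fd, RTC_RD_TIME, &tm) < 0) {
		perror("rtc");
		return 1;
	}
	printf("%04d-%02d-%02d %02d:%02d:%02d UTC\n",
	       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
	       tm.tm_hour, tm.tm_min, tm.tm_sec);
	close(fd);
	return 0;
}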
+ 163 - 75
arch/sparc64/kernel/trampoline.S

@@ -16,6 +16,8 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/mmu.h>
+#include <asm/hypervisor.h>
+#include <asm/cpudata.h>
 
 	.data
 	.align	8
@@ -28,14 +30,19 @@ itlb_load:
 dtlb_load:
 	.asciz	"SUNW,dtlb-load"
 
+	/* XXX __cpuinit this thing XXX */
+#define TRAMP_STACK_SIZE	1024
+	.align	16
+tramp_stack:
+	.skip	TRAMP_STACK_SIZE
+
 	.text
 	.align		8
 	.globl		sparc64_cpu_startup, sparc64_cpu_startup_end
 sparc64_cpu_startup:
-	flushw
-
-	BRANCH_IF_CHEETAH_BASE(g1,g5,cheetah_startup)
-	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g5,cheetah_plus_startup)
+	BRANCH_IF_SUN4V(g1, niagara_startup)
+	BRANCH_IF_CHEETAH_BASE(g1, g5, cheetah_startup)
+	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1, g5, cheetah_plus_startup)
 
 	ba,pt	%xcc, spitfire_startup
 	 nop
@@ -55,6 +62,7 @@ cheetah_startup:
 	or	%g5, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g5
 	stxa	%g5, [%g0] ASI_DCU_CONTROL_REG
 	membar	#Sync
+	/* fallthru */
 
 cheetah_generic_startup:
 	mov	TSB_EXTENSION_P, %g3
@@ -70,7 +78,9 @@ cheetah_generic_startup:
 	stxa	%g0, [%g3] ASI_DMMU
 	stxa	%g0, [%g3] ASI_IMMU
 	membar	#Sync
+	/* fallthru */
 
+niagara_startup:
 	/* Disable STICK_INT interrupts. */
 	sethi		%hi(0x80000000), %g5
 	sllx		%g5, 32, %g5
@@ -85,17 +95,17 @@ spitfire_startup:
 	membar		#Sync
 
 startup_continue:
-	wrpr		%g0, 15, %pil
-
 	sethi		%hi(0x80000000), %g2
 	sllx		%g2, 32, %g2
 	wr		%g2, 0, %tick_cmpr
 
+	mov		%o0, %l0
+
+	BRANCH_IF_SUN4V(g1, niagara_lock_tlb)
+
 	/* Call OBP by hand to lock KERNBASE into i/d tlbs.
 	 * We lock 2 consecutive entries if we are 'bigkernel'.
 	 */
-	mov		%o0, %l0
-
 	sethi		%hi(prom_entry_lock), %g2
 1:	ldstub		[%g2 + %lo(prom_entry_lock)], %g1
 	membar		#StoreLoad | #StoreStore
@@ -105,7 +115,6 @@ startup_continue:
 	sethi		%hi(p1275buf), %g2
 	or		%g2, %lo(p1275buf), %g2
 	ldx		[%g2 + 0x10], %l2
-	mov		%sp, %l1
 	add		%l2, -(192 + 128), %sp
 	flushw
 
@@ -142,8 +151,7 @@ startup_continue:
 
 	sethi		%hi(bigkernel), %g2
 	lduw		[%g2 + %lo(bigkernel)], %g2
-	cmp		%g2, 0
-	be,pt		%icc, do_dtlb
+	brz,pt		%g2, do_dtlb
 	 nop
 
 	sethi		%hi(call_method), %g2
@@ -214,8 +222,7 @@ do_dtlb:
 
 	sethi		%hi(bigkernel), %g2
 	lduw		[%g2 + %lo(bigkernel)], %g2
-	cmp		%g2, 0
-	be,pt		%icc, do_unlock
+	brz,pt		%g2, do_unlock
 	 nop
 
 	sethi		%hi(call_method), %g2
@@ -257,99 +264,180 @@ do_unlock:
 	stb		%g0, [%g2 + %lo(prom_entry_lock)]
 	membar		#StoreStore | #StoreLoad
 
-	mov		%l1, %sp
-	flushw
+	ba,pt		%xcc, after_lock_tlb
+	 nop
+
+niagara_lock_tlb:
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	sethi		%hi(KERNBASE), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	mov		HV_MMU_IMMU, %o3
+	ta		HV_FAST_TRAP
+
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	sethi		%hi(KERNBASE), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	mov		HV_MMU_DMMU, %o3
+	ta		HV_FAST_TRAP
 
-	mov		%l0, %o0
+	sethi		%hi(bigkernel), %g2
+	lduw		[%g2 + %lo(bigkernel)], %g2
+	brz,pt		%g2, after_lock_tlb
+	 nop
 
 
+	sethi		%hi(KERNBASE + 0x400000), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	sethi		%hi(0x400000), %o3
+	add		%o2, %o3, %o2
+	mov		HV_MMU_IMMU, %o3
+	ta		HV_FAST_TRAP
+
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	sethi		%hi(KERNBASE + 0x400000), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	sethi		%hi(0x400000), %o3
+	add		%o2, %o3, %o2
+	mov		HV_MMU_DMMU, %o3
+	ta		HV_FAST_TRAP
+
+after_lock_tlb:
 	wrpr		%g0, (PSTATE_PRIV | PSTATE_PEF), %pstate
 	wr		%g0, 0, %fprs
 
-	/* XXX Buggy PROM... */
-	srl		%o0, 0, %o0
-	ldx		[%o0], %g6
-
 	wr		%g0, ASI_P, %asi
 
 	mov		PRIMARY_CONTEXT, %g7
-	stxa		%g0, [%g7] ASI_DMMU
+
+661:	stxa		%g0, [%g7] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g0, [%g7] ASI_MMU
+	.previous
+
 	membar	#Sync
 	mov	SECONDARY_CONTEXT, %g7
-	stxa		%g0, [%g7] ASI_DMMU
+
+661:	stxa		%g0, [%g7] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g0, [%g7] ASI_MMU
+	.previous
+
 	membar	#Sync
 
-	mov		1, %g5
-	sllx		%g5, THREAD_SHIFT, %g5
-	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
-	add		%g6, %g5, %sp
+	/* Everything we do here, until we properly take over the
+	 * trap table, must be done with extreme care.  We cannot
+	 * make any references to %g6 (current thread pointer),
+	 * %g4 (current task pointer), or %g5 (base of current cpu's
+	 * per-cpu area) until we properly take over the trap table
+	 * from the firmware and hypervisor.
+	 *
+	 * Get onto temporary stack which is in the locked kernel image.
+	 */
+	sethi		%hi(tramp_stack), %g1
+	or		%g1, %lo(tramp_stack), %g1
+	add		%g1, TRAMP_STACK_SIZE, %g1
+	sub		%g1, STACKFRAME_SZ + STACK_BIAS, %sp
 	mov		0, %fp
 
-	wrpr		%g0, 0, %wstate
-	wrpr		%g0, 0, %tl
+	/* Put garbage in these registers to trap any access to them.  */
+	set		0xdeadbeef, %g4
+	set		0xdeadbeef, %g5
+	set		0xdeadbeef, %g6
 
-	/* Setup the trap globals, then we can resurface. */
-	rdpr		%pstate, %o1
-	mov		%g6, %o2
-	wrpr		%o1, PSTATE_AG, %pstate
-	sethi		%hi(sparc64_ttable_tl0), %g5
-	wrpr		%g5, %tba
-	mov		%o2, %g6
-
-	wrpr		%o1, PSTATE_MG, %pstate
-#define KERN_HIGHBITS		((_PAGE_VALID|_PAGE_SZ4MB)^0xfffff80000000000)
-#define KERN_LOWBITS		(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-
-	mov		TSB_REG, %g1
-	stxa		%g0, [%g1] ASI_DMMU
-	membar		#Sync
-	mov		TLB_SFSR, %g1
-	sethi		%uhi(KERN_HIGHBITS), %g2
-	or		%g2, %ulo(KERN_HIGHBITS), %g2
-	sllx		%g2, 32, %g2
-	or		%g2, KERN_LOWBITS, %g2
+	call		init_irqwork_curcpu
+	 nop
 
-	BRANCH_IF_ANY_CHEETAH(g3,g7,9f)
+	sethi		%hi(tlb_type), %g3
+	lduw		[%g3 + %lo(tlb_type)], %g2
+	cmp		%g2, 3
+	bne,pt		%icc, 1f
+	 nop
 
-	ba,pt		%xcc, 1f
+	call		hard_smp_processor_id
 	 nop
+	
+	mov		%o0, %o1
+	mov		0, %o0
+	mov		0, %o2
+	call		sun4v_init_mondo_queues
+	 mov		1, %o3
 
-9:
-	sethi		%uhi(VPTE_BASE_CHEETAH), %g3
-	or		%g3, %ulo(VPTE_BASE_CHEETAH), %g3
-	ba,pt		%xcc, 2f
-	 sllx		%g3, 32, %g3
-1:
-	sethi		%uhi(VPTE_BASE_SPITFIRE), %g3
-	or		%g3, %ulo(VPTE_BASE_SPITFIRE), %g3
-	sllx		%g3, 32, %g3
+1:	call		init_cur_cpu_trap
+	 ldx		[%l0], %o0
+
+	/* Start using proper page size encodings in ctx register.  */
+	sethi		%hi(sparc64_kern_pri_context), %g3
+	ldx		[%g3 + %lo(sparc64_kern_pri_context)], %g2
+	mov		PRIMARY_CONTEXT, %g1
 
-2:
-	clr	%g7
-#undef KERN_HIGHBITS
-#undef KERN_LOWBITS
+661:	stxa		%g2, [%g1] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g1] ASI_MMU
+	.previous
 
-	wrpr		%o1, 0x0, %pstate
-	ldx		[%g6 + TI_TASK], %g4
 	membar	#Sync
 
 	wrpr		%g0, 0, %wstate
 
+	/* As a hack, put &init_thread_union into %g6.
+	 * prom_world() loads from here to restore the %asi
+	 * register.
+	 */
+	sethi		%hi(init_thread_union), %g6
+	or		%g6, %lo(init_thread_union), %g6
+
+	sethi		%hi(is_sun4v), %o0
+	lduw		[%o0 + %lo(is_sun4v)], %o0
+	brz,pt		%o0, 1f
 	 nop
 
-	/* Start using proper page size encodings in ctx register.  */
-	sethi	%hi(sparc64_kern_pri_context), %g3
-	ldx	[%g3 + %lo(sparc64_kern_pri_context)], %g2
-	mov	PRIMARY_CONTEXT, %g1
-	stxa	%g2, [%g1] ASI_DMMU
-	membar	#Sync
+	TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
+	add		%g2, TRAP_PER_CPU_FAULT_INFO, %g2
+	stxa		%g2, [%g0] ASI_SCRATCHPAD
+
+	/* Compute physical address:
+	 *
+	 * paddr = kern_base + (mmfsa_vaddr - KERNBASE)
+	 */
+	sethi		%hi(KERNBASE), %g3
+	sub		%g2, %g3, %g2
+	sethi		%hi(kern_base), %g3
+	ldx		[%g3 + %lo(kern_base)], %g3
+	add		%g2, %g3, %o1
+
+	call		prom_set_trap_table_sun4v
+	 sethi		%hi(sparc64_ttable_tl0), %o0
+
+	ba,pt		%xcc, 2f
+	 nop
+
+1:	call		prom_set_trap_table
+	 sethi		%hi(sparc64_ttable_tl0), %o0
+
+2:	ldx		[%l0], %g6
+	ldx		[%g6 + TI_TASK], %g4
+
+	mov		1, %g5
+	sllx		%g5, THREAD_SHIFT, %g5
+	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
+	add		%g6, %g5, %sp
+	mov		0, %fp
 
 	rdpr		%pstate, %o1
 	or		%o1, PSTATE_IE, %o1
 	wrpr		%o1, 0, %pstate
 
-	call		prom_set_trap_table
-	 sethi		%hi(sparc64_ttable_tl0), %o0
-
 	call		smp_callin
 	 nop
 	call		cpu_idle

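The ".sun4v_1insn_patch" sections emitted above pair the address of each "661:" instruction with its sun4v replacement; early boot walks the table and rewrites the instructions in place. A sketch of that patcher — the struct layout and symbol names are assumptions read off the section directives, not verified against the tree:

struct sun4v_1insn_patch_entry {
	unsigned int	addr;	/* address of the 661: instruction */
	unsigned int	insn;	/* sun4v replacement instruction   */
};

extern struct sun4v_1insn_patch_entry __sun4v_1insn_patch,
				      __sun4v_1insn_patch_end;

static void __init sun4v_patch_1insn(void)
{
	struct sun4v_1insn_patch_entry *p = &__sun4v_1insn_patch;

	while (p < &__sun4v_1insn_patch_end) {
		unsigned long where = p->addr;

		*(unsigned int *) where = p->insn;	/* overwrite in place */
		__asm__ __volatile__("flush	%0"	/* sync the I-cache  */
				     : : "r" (where));
		p++;
	}
}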
+ 388 - 15
arch/sparc64/kernel/traps.c

@@ -38,6 +38,7 @@
 #include <asm/processor.h>
 #include <asm/timer.h>
 #include <asm/kdebug.h>
+#include <asm/head.h>
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
@@ -72,12 +73,14 @@ struct tl1_traplog {
 
 static void dump_tl1_traplog(struct tl1_traplog *p)
 {
-	int i;
+	int i, limit;
+
+	printk(KERN_EMERG "TRAPLOG: Error at trap level 0x%lx, "
+	       "dumping trap stack.\n", p->tl);
 
-	printk("TRAPLOG: Error at trap level 0x%lx, dumping track stack.\n",
-	       p->tl);
-	for (i = 0; i < 4; i++) {
-		printk(KERN_CRIT
+	limit = (tlb_type == hypervisor) ? 2 : 4;
+	for (i = 0; i < limit; i++) {
+		printk(KERN_EMERG
 		       "TRAPLOG: Trap level %d TSTATE[%016lx] TPC[%016lx] "
 		       "TNPC[%016lx] TT[%lx]\n",
 		       i + 1,
@@ -179,6 +182,45 @@ void spitfire_insn_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr
 	spitfire_insn_access_exception(regs, sfsr, sfar);
 }
 
+void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	unsigned short type = (type_ctx >> 16);
+	unsigned short ctx  = (type_ctx & 0xffff);
+	siginfo_t info;
+
+	if (notify_die(DIE_TRAP, "instruction access exception", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		printk("sun4v_insn_access_exception: ADDR[%016lx] "
+		       "CTX[%04x] TYPE[%04x], going.\n",
+		       addr, ctx, type);
+		die_if_kernel("Iax", regs);
+	}
+
+	if (test_thread_flag(TIF_32BIT)) {
+		regs->tpc &= 0xffffffff;
+		regs->tnpc &= 0xffffffff;
+	}
+	info.si_signo = SIGSEGV;
+	info.si_errno = 0;
+	info.si_code = SEGV_MAPERR;
+	info.si_addr = (void __user *) addr;
+	info.si_trapno = 0;
+	force_sig_info(SIGSEGV, &info, current);
+}
+
+void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	if (notify_die(DIE_TRAP_TL1, "instruction access exception tl1", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+	sun4v_insn_access_exception(regs, addr, type_ctx);
+}
+
 void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
 {
 	siginfo_t info;
@@ -227,6 +269,45 @@ void spitfire_data_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr
 	spitfire_data_access_exception(regs, sfsr, sfar);
 }
 
+void sun4v_data_access_exception(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	unsigned short type = (type_ctx >> 16);
+	unsigned short ctx  = (type_ctx & 0xffff);
+	siginfo_t info;
+
+	if (notify_die(DIE_TRAP, "data access exception", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		printk("sun4v_data_access_exception: ADDR[%016lx] "
+		       "CTX[%04x] TYPE[%04x], going.\n",
+		       addr, ctx, type);
+		die_if_kernel("Dax", regs);
+	}
+
+	if (test_thread_flag(TIF_32BIT)) {
+		regs->tpc &= 0xffffffff;
+		regs->tnpc &= 0xffffffff;
+	}
+	info.si_signo = SIGSEGV;
+	info.si_errno = 0;
+	info.si_code = SEGV_MAPERR;
+	info.si_addr = (void __user *) addr;
+	info.si_trapno = 0;
+	force_sig_info(SIGSEGV, &info, current);
+}
+
+void sun4v_data_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	if (notify_die(DIE_TRAP_TL1, "data access exception tl1", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+	sun4v_data_access_exception(regs, addr, type_ctx);
+}
+
 #ifdef CONFIG_PCI
 /* This is really pathetic... */
 extern volatile int pci_poke_in_progress;
@@ -788,7 +869,8 @@ void __init cheetah_ecache_flush_init(void)
 		cheetah_error_log[i].afsr = CHAFSR_INVALID;
 
 	__asm__ ("rdpr %%ver, %0" : "=r" (ver));
-	if ((ver >> 32) == 0x003e0016) {
+	if ((ver >> 32) == __JALAPENO_ID ||
+	    (ver >> 32) == __SERRANO_ID) {
 		cheetah_error_table = &__jalapeno_error_table[0];
 		cheetah_afsr_errors = JPAFSR_ERRORS;
 	} else if ((ver >> 32) == 0x003e0015) {
@@ -1666,6 +1748,238 @@ void cheetah_plus_parity_error(int type, struct pt_regs *regs)
 	       regs->tpc);
 }
 
+struct sun4v_error_entry {
+	u64		err_handle;
+	u64		err_stick;
+
+	u32		err_type;
+#define SUN4V_ERR_TYPE_UNDEFINED	0
+#define SUN4V_ERR_TYPE_UNCORRECTED_RES	1
+#define SUN4V_ERR_TYPE_PRECISE_NONRES	2
+#define SUN4V_ERR_TYPE_DEFERRED_NONRES	3
+#define SUN4V_ERR_TYPE_WARNING_RES	4
+
+	u32		err_attrs;
+#define SUN4V_ERR_ATTRS_PROCESSOR	0x00000001
+#define SUN4V_ERR_ATTRS_MEMORY		0x00000002
+#define SUN4V_ERR_ATTRS_PIO		0x00000004
+#define SUN4V_ERR_ATTRS_INT_REGISTERS	0x00000008
+#define SUN4V_ERR_ATTRS_FPU_REGISTERS	0x00000010
+#define SUN4V_ERR_ATTRS_USER_MODE	0x01000000
+#define SUN4V_ERR_ATTRS_PRIV_MODE	0x02000000
+#define SUN4V_ERR_ATTRS_RES_QUEUE_FULL	0x80000000
+
+	u64		err_raddr;
+	u32		err_size;
+	u16		err_cpu;
+	u16		err_pad;
+};
+
+static atomic_t sun4v_resum_oflow_cnt = ATOMIC_INIT(0);
+static atomic_t sun4v_nonresum_oflow_cnt = ATOMIC_INIT(0);
+
+static const char *sun4v_err_type_to_str(u32 type)
+{
+	switch (type) {
+	case SUN4V_ERR_TYPE_UNDEFINED:
+		return "undefined";
+	case SUN4V_ERR_TYPE_UNCORRECTED_RES:
+		return "uncorrected resumable";
+	case SUN4V_ERR_TYPE_PRECISE_NONRES:
+		return "precise nonresumable";
+	case SUN4V_ERR_TYPE_DEFERRED_NONRES:
+		return "deferred nonresumable";
+	case SUN4V_ERR_TYPE_WARNING_RES:
+		return "warning resumable";
+	default:
+		return "unknown";
+	};
+}
+
+static void sun4v_log_error(struct sun4v_error_entry *ent, int cpu, const char *pfx, atomic_t *ocnt)
+{
+	int cnt;
+
+	printk("%s: Reporting on cpu %d\n", pfx, cpu);
+	printk("%s: err_handle[%lx] err_stick[%lx] err_type[%08x:%s]\n",
+	       pfx,
+	       ent->err_handle, ent->err_stick,
+	       ent->err_type,
+	       sun4v_err_type_to_str(ent->err_type));
+	printk("%s: err_attrs[%08x:%s %s %s %s %s %s %s %s]\n",
+	       pfx,
+	       ent->err_attrs,
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_PROCESSOR) ?
+		"processor" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_MEMORY) ?
+		"memory" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_PIO) ?
+		"pio" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_INT_REGISTERS) ?
+		"integer-regs" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_FPU_REGISTERS) ?
+		"fpu-regs" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_USER_MODE) ?
+		"user" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_PRIV_MODE) ?
+		"privileged" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_RES_QUEUE_FULL) ?
+		"queue-full" : ""));
+	printk("%s: err_raddr[%016lx] err_size[%u] err_cpu[%u]\n",
+	       pfx,
+	       ent->err_raddr, ent->err_size, ent->err_cpu);
+
+	if ((cnt = atomic_read(ocnt)) != 0) {
+		atomic_set(ocnt, 0);
+		wmb();
+		printk("%s: Queue overflowed %d times.\n",
+		       pfx, cnt);
+	}
+}
+
+/* We run with %pil set to 15 and PSTATE_IE enabled in %pstate.
+ * Log the event and clear the first word of the entry.
+ */
+void sun4v_resum_error(struct pt_regs *regs, unsigned long offset)
+{
+	struct sun4v_error_entry *ent, local_copy;
+	struct trap_per_cpu *tb;
+	unsigned long paddr;
+	int cpu;
+
+	cpu = get_cpu();
+
+	tb = &trap_block[cpu];
+	paddr = tb->resum_kernel_buf_pa + offset;
+	ent = __va(paddr);
+
+	memcpy(&local_copy, ent, sizeof(struct sun4v_error_entry));
+
+	/* We have a local copy now, so release the entry.  */
+	ent->err_handle = 0;
+	wmb();
+
+	put_cpu();
+
+	sun4v_log_error(&local_copy, cpu,
+			KERN_ERR "RESUMABLE ERROR",
+			&sun4v_resum_oflow_cnt);
+}
+
+/* If we try to printk() we'll probably make matters worse, by trying
+ * to retake locks this cpu already holds or causing more errors. So
+ * just bump a counter, and we'll report these counter bumps above.
+ */
+void sun4v_resum_overflow(struct pt_regs *regs)
+{
+	atomic_inc(&sun4v_resum_oflow_cnt);
+}
+
+/* We run with %pil set to 15 and PSTATE_IE enabled in %pstate.
+ * Log the event, clear the first word of the entry, and die.
+ */
+void sun4v_nonresum_error(struct pt_regs *regs, unsigned long offset)
+{
+	struct sun4v_error_entry *ent, local_copy;
+	struct trap_per_cpu *tb;
+	unsigned long paddr;
+	int cpu;
+
+	cpu = get_cpu();
+
+	tb = &trap_block[cpu];
+	paddr = tb->nonresum_kernel_buf_pa + offset;
+	ent = __va(paddr);
+
+	memcpy(&local_copy, ent, sizeof(struct sun4v_error_entry));
+
+	/* We have a local copy now, so release the entry.  */
+	ent->err_handle = 0;
+	wmb();
+
+	put_cpu();
+
+#ifdef CONFIG_PCI
+	/* Check for the special PCI poke sequence. */
+	if (pci_poke_in_progress && pci_poke_cpu == cpu) {
+		pci_poke_faulted = 1;
+		regs->tpc += 4;
+		regs->tnpc = regs->tpc + 4;
+		return;
+	}
+#endif
+
+	sun4v_log_error(&local_copy, cpu,
+			KERN_EMERG "NON-RESUMABLE ERROR",
+			&sun4v_nonresum_oflow_cnt);
+
+	panic("Non-resumable error.");
+}
+
+/* If we try to printk() we'll probably make matters worse, by trying
+ * to retake locks this cpu already holds or causing more errors. So
+ * just bump a counter, and we'll report these counter bumps above.
+ */
+void sun4v_nonresum_overflow(struct pt_regs *regs)
+{
+	/* XXX Actually even this can make not that much sense.  Perhaps
+	 * XXX we should just pull the plug and panic directly from here?
+	 */
+	atomic_inc(&sun4v_nonresum_oflow_cnt);
+}
+
+unsigned long sun4v_err_itlb_vaddr;
+unsigned long sun4v_err_itlb_ctx;
+unsigned long sun4v_err_itlb_pte;
+unsigned long sun4v_err_itlb_error;
+
+void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
+{
+	if (tl > 1)
+		dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+
+	printk(KERN_EMERG "SUN4V-ITLB: Error at TPC[%lx], tl %d\n",
+	       regs->tpc, tl);
+	printk(KERN_EMERG "SUN4V-ITLB: vaddr[%lx] ctx[%lx] "
+	       "pte[%lx] error[%lx]\n",
+	       sun4v_err_itlb_vaddr, sun4v_err_itlb_ctx,
+	       sun4v_err_itlb_pte, sun4v_err_itlb_error);
+
+	prom_halt();
+}
+
+unsigned long sun4v_err_dtlb_vaddr;
+unsigned long sun4v_err_dtlb_ctx;
+unsigned long sun4v_err_dtlb_pte;
+unsigned long sun4v_err_dtlb_error;
+
+void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
+{
+	if (tl > 1)
+		dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+
+	printk(KERN_EMERG "SUN4V-DTLB: Error at TPC[%lx], tl %d\n",
+	       regs->tpc, tl);
+	printk(KERN_EMERG "SUN4V-DTLB: vaddr[%lx] ctx[%lx] "
+	       "pte[%lx] error[%lx]\n",
+	       sun4v_err_dtlb_vaddr, sun4v_err_dtlb_ctx,
+	       sun4v_err_dtlb_pte, sun4v_err_dtlb_error);
+
+	prom_halt();
+}
+
+void hypervisor_tlbop_error(unsigned long err, unsigned long op)
+{
+	printk(KERN_CRIT "SUN4V: TLB hv call error %lu for op %lu\n",
+	       err, op);
+}
+
+void hypervisor_tlbop_error_xcall(unsigned long err, unsigned long op)
+{
+	printk(KERN_CRIT "SUN4V: XCALL TLB hv call error %lu for op %lu\n",
+	       err, op);
+}
+
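sun4v_resum_error() and sun4v_nonresum_error() above follow the same consume-then-release protocol on the hypervisor-shared error queue: snapshot the entry, zero err_handle to hand the slot back, and fence the stores. Distilled into a sketch (the helper name is illustrative; wmb() is the kernel's store barrier):

#include <string.h>

/* ent points at the shared queue slot; snap is private kernel memory. */
static void consume_error_entry(struct sun4v_error_entry *ent,
				struct sun4v_error_entry *snap)
{
	memcpy(snap, ent, sizeof(*snap));
	ent->err_handle = 0;	/* release the slot to the hypervisor */
	wmb();			/* ... only after the copy completes   */
}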
 void do_fpe_common(struct pt_regs *regs)
 {
 	if (regs->tstate & TSTATE_PRIV) {
@@ -1924,10 +2238,11 @@ void die_if_kernel(char *str, struct pt_regs *regs)
 		}
 		user_instruction_dump ((unsigned int __user *) regs->tpc);
 	}
+#if 0
 #ifdef CONFIG_SMP
 	smp_report_regs();
 #endif
-                                                	
+#endif                                                	
 	if (regs->tstate & TSTATE_PRIV)
 		do_exit(SIGKILL);
 	do_exit(SIGSEGV);
@@ -1958,6 +2273,11 @@ void do_illegal_instruction(struct pt_regs *regs)
 		} else if ((insn & 0xc1580000) == 0xc1100000) /* LDQ/STQ */ {
 			if (handle_ldf_stq(insn, regs))
 				return;
+		} else if (tlb_type == hypervisor) {
+			extern int vis_emul(struct pt_regs *, unsigned int);
+
+			if (!vis_emul(regs, insn))
+				return;
 		}
 	}
 	info.si_signo = SIGILL;
@@ -1968,6 +2288,8 @@ void do_illegal_instruction(struct pt_regs *regs)
 	force_sig_info(SIGILL, &info, current);
 }
 
+extern void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn);
+
 void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr)
 {
 	siginfo_t info;
@@ -1977,13 +2299,7 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
 		return;
 
 	if (regs->tstate & TSTATE_PRIV) {
-		extern void kernel_unaligned_trap(struct pt_regs *regs,
-						  unsigned int insn, 
-						  unsigned long sfar,
-						  unsigned long sfsr);
-
-		kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc),
-				      sfar, sfsr);
+		kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc));
 		return;
 	}
 	info.si_signo = SIGBUS;
@@ -1994,6 +2310,26 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
 	force_sig_info(SIGBUS, &info, current);
 }
 
+void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	siginfo_t info;
+
+	if (notify_die(DIE_TRAP, "memory address unaligned", regs,
+		       0, 0x34, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc));
+		return;
+	}
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = BUS_ADRALN;
+	info.si_addr = (void __user *) addr;
+	info.si_trapno = 0;
+	force_sig_info(SIGBUS, &info, current);
+}
+
 void do_privop(struct pt_regs *regs)
 {
 	siginfo_t info;
@@ -2130,7 +2466,22 @@ void do_getpsr(struct pt_regs *regs)
 	}
 }
 
+struct trap_per_cpu trap_block[NR_CPUS];
+
+/* This can get invoked before sched_init() so play it super safe
+ * and use hard_smp_processor_id().
+ */
+void init_cur_cpu_trap(struct thread_info *t)
+{
+	int cpu = hard_smp_processor_id();
+	struct trap_per_cpu *p = &trap_block[cpu];
+
+	p->thread = t;
+	p->pgd_paddr = 0;
+}
+
 extern void thread_info_offsets_are_bolixed_dave(void);
+extern void trap_per_cpu_offsets_are_bolixed_dave(void);
 
 /* Only invoked on boot processor. */
 void __init trap_init(void)
@@ -2154,7 +2505,6 @@ void __init trap_init(void)
 	    TI_KERN_CNTD0 != offsetof(struct thread_info, kernel_cntd0) ||
 	    TI_KERN_CNTD1 != offsetof(struct thread_info, kernel_cntd1) ||
 	    TI_PCR != offsetof(struct thread_info, pcr_reg) ||
-	    TI_CEE_STUFF != offsetof(struct thread_info, cee_stuff) ||
 	    TI_PRE_COUNT != offsetof(struct thread_info, preempt_count) ||
 	    TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
 	    TI_SYS_NOERROR != offsetof(struct thread_info, syscall_noerror) ||
@@ -2165,6 +2515,29 @@ void __init trap_init(void)
 	    (TI_FPREGS & (64 - 1)))
 		thread_info_offsets_are_bolixed_dave();
 
+	if (TRAP_PER_CPU_THREAD != offsetof(struct trap_per_cpu, thread) ||
+	    (TRAP_PER_CPU_PGD_PADDR !=
+	     offsetof(struct trap_per_cpu, pgd_paddr)) ||
+	    (TRAP_PER_CPU_CPU_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, cpu_mondo_pa)) ||
+	    (TRAP_PER_CPU_DEV_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, dev_mondo_pa)) ||
+	    (TRAP_PER_CPU_RESUM_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, resum_mondo_pa)) ||
+	    (TRAP_PER_CPU_RESUM_KBUF_PA !=
+	     offsetof(struct trap_per_cpu, resum_kernel_buf_pa)) ||
+	    (TRAP_PER_CPU_NONRESUM_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, nonresum_mondo_pa)) ||
+	    (TRAP_PER_CPU_NONRESUM_KBUF_PA !=
+	     offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa)) ||
+	    (TRAP_PER_CPU_FAULT_INFO !=
+	     offsetof(struct trap_per_cpu, fault_info)) ||
+	    (TRAP_PER_CPU_CPU_MONDO_BLOCK_PA !=
+	     offsetof(struct trap_per_cpu, cpu_mondo_block_pa)) ||
+	    (TRAP_PER_CPU_CPU_LIST_PA !=
+	     offsetof(struct trap_per_cpu, cpu_list_pa)))
+		trap_per_cpu_offsets_are_bolixed_dave();
+
 	/* Attach to the address space of init_task.  On SMP we
 	 * do this in smp.c:smp_callin for other cpus.
 	 */

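The *_offsets_are_bolixed_dave() calls above lean on every offsetof() comparison being a compile-time constant: when the layouts match, gcc eliminates the calls, so the (presumably never-defined) symbols are never referenced; when they drift, the link fails. The modern spelling of the same trick, as a sketch with an illustrative struct and offset:

#include <stddef.h>

/* Fails to compile (negative array size) when cond is true. */
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

struct trap_per_cpu_sketch {	/* illustrative layout only */
	void		*thread;
	unsigned long	pgd_paddr;
};

#define TRAP_PER_CPU_PGD_PADDR_SK 8	/* what the asm would hard-code */

static inline void check_trap_block_layout(void)
{
	BUILD_BUG_ON(TRAP_PER_CPU_PGD_PADDR_SK !=
		     offsetof(struct trap_per_cpu_sketch, pgd_paddr));
}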
+ 442 - 0
arch/sparc64/kernel/tsb.S

@@ -0,0 +1,442 @@
+/* tsb.S: Sparc64 TSB table handling.
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <asm/tsb.h>
+#include <asm/hypervisor.h>
+
+	.text
+	.align	32
+
+	/* Invoked from TLB miss handler, we are in the
+	 * MMU global registers and they are setup like
+	 * this:
+	 *
+	 * %g1: TSB entry pointer
+	 * %g2:	available temporary
+	 * %g3:	FAULT_CODE_{D,I}TLB
+	 * %g4:	available temporary
+	 * %g5:	available temporary
+	 * %g6: TAG TARGET
+	 * %g7:	available temporary, will be loaded by us with
+	 *      the physical address base of the linux page
+	 *      tables for the current address space
+	 */
+tsb_miss_dtlb:
+	mov		TLB_TAG_ACCESS, %g4
+	ba,pt		%xcc, tsb_miss_page_table_walk
+	 ldxa		[%g4] ASI_DMMU, %g4
+
+tsb_miss_itlb:
+	mov		TLB_TAG_ACCESS, %g4
+	ba,pt		%xcc, tsb_miss_page_table_walk
+	 ldxa		[%g4] ASI_IMMU, %g4
+
+	/* At this point we have:
+	 * %g1 --	TSB entry address
+	 * %g3 --	FAULT_CODE_{D,I}TLB
+	 * %g4 --	missing virtual address
+	 * %g6 --	TAG TARGET (vaddr >> 22)
+	 */
+tsb_miss_page_table_walk:
+	TRAP_LOAD_PGD_PHYS(%g7, %g5)
+
+	/* And now we have the PGD base physical address in %g7.  */
+tsb_miss_page_table_walk_sun4v_fastpath:
+	USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
+
+	/* At this point we have:
+	 * %g1 --	TSB entry address
+	 * %g3 --	FAULT_CODE_{D,I}TLB
+	 * %g5 --	physical address of PTE in Linux page tables
+	 * %g6 --	TAG TARGET (vaddr >> 22)
+	 */
+tsb_reload:
+	TSB_LOCK_TAG(%g1, %g2, %g7)
+
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	mov		1, %g7
+	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
+	brgez,a,pn	%g5, tsb_do_fault
+	 TSB_STORE(%g1, %g7)
+
+	TSB_WRITE(%g1, %g5, %g6)
+
+	/* Finally, load TLB and return from trap.  */
+tsb_tlb_reload:
+	cmp		%g3, FAULT_CODE_DTLB
+	bne,pn		%xcc, tsb_itlb_load
+	 nop
+
+tsb_dtlb_load:
+
+661:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
+	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_DTLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_dtlb_load
+	 mov		%g5, %g3
+
+tsb_itlb_load:
+	/* Executable bit must be set.  */
+661:	andcc		%g5, _PAGE_EXEC_4U, %g0
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	andcc		%g5, _PAGE_EXEC_4V, %g0
+	.previous
+
+	be,pn		%xcc, tsb_do_fault
+	 nop
+
+661:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
+	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_ITLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_itlb_load
+	 mov		%g5, %g3
+
+	/* No valid entry in the page tables, do full fault
+	 * processing.
+	 */
+
+	.globl		tsb_do_fault
+tsb_do_fault:
+	cmp		%g3, FAULT_CODE_DTLB
+
+661:	rdpr		%pstate, %g5
+	wrpr		%g5, PSTATE_AG | PSTATE_MG, %pstate
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	SET_GL(1)
+	ldxa		[%g0] ASI_SCRATCHPAD, %g4
+	.previous
+
+	bne,pn		%xcc, tsb_do_itlb_fault
+	 nop
+
+tsb_do_dtlb_fault:
+	rdpr	%tl, %g3
+	cmp	%g3, 1
+
+661:	mov	TLB_TAG_ACCESS, %g4
+	ldxa	[%g4] ASI_DMMU, %g5
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	ldx	[%g4 + HV_FAULT_D_ADDR_OFFSET], %g5
+	nop
+	.previous
+
+	be,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB, %g4
+	ba,pt	%xcc, winfix_trampoline
+	 nop
+
+tsb_do_itlb_fault:
+	rdpr	%tpc, %g5
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_ITLB, %g4
+
+	.globl	sparc64_realfault_common
+sparc64_realfault_common:
+	/* fault code in %g4, fault address in %g5, etrap will
+	 * preserve these two values in %l4 and %l5 respectively
+	 */
+	ba,pt	%xcc, etrap			! Save trap state
+1:	 rd	%pc, %g7			! ...
+	stb	%l4, [%g6 + TI_FAULT_CODE]	! Save fault code
+	stx	%l5, [%g6 + TI_FAULT_ADDR]	! Save fault address
+	call	do_sparc64_fault		! Call fault handler
+	 add	%sp, PTREGS_OFF, %o0		! Compute pt_regs arg
+	ba,pt	%xcc, rtrap_clr_l6		! Restore cpu state
+	 nop					! Delay slot (fill me)
+
+winfix_trampoline:
+	rdpr	%tpc, %g3			! Prepare winfixup TNPC
+	or	%g3, 0x7c, %g3			! Compute branch offset
+	wrpr	%g3, %tnpc			! Write it into TNPC
+	done					! Trap return
+
+	/* Insert an entry into the TSB.
+	 *
+	 * %o0: TSB entry pointer (virt or phys address)
+	 * %o1: tag
+	 * %o2:	pte
+	 */
+	.align	32
+	.globl	__tsb_insert
+__tsb_insert:
+	rdpr	%pstate, %o5
+	wrpr	%o5, PSTATE_IE, %pstate
+	TSB_LOCK_TAG(%o0, %g2, %g3)
+	TSB_WRITE(%o0, %o2, %o1)
+	wrpr	%o5, %pstate
+	retl
+	 nop
+	.size	__tsb_insert, .-__tsb_insert
+
+	/* Flush the given TSB entry if it has the matching
+	 * tag.
+	 *
+	 * %o0: TSB entry pointer (virt or phys address)
+	 * %o1:	tag
+	 */
+	.align	32
+	.globl	tsb_flush
+	.type	tsb_flush,#function
+tsb_flush:
+	sethi	%hi(TSB_TAG_LOCK_HIGH), %g2
+1:	TSB_LOAD_TAG(%o0, %g1)
+	srlx	%g1, 32, %o3
+	andcc	%o3, %g2, %g0
+	bne,pn	%icc, 1b
+	 membar	#LoadLoad
+	cmp	%g1, %o1
+	mov	1, %o3
+	bne,pt	%xcc, 2f
+	 sllx	%o3, TSB_TAG_INVALID_BIT, %o3
+	TSB_CAS_TAG(%o0, %g1, %o3)
+	cmp	%g1, %o3
+	bne,pn	%xcc, 1b
+	 nop
+2:	retl
+	 TSB_MEMBAR
+	.size	tsb_flush, .-tsb_flush
+
+	/* Reload MMU related context switch state at
+	 * schedule() time.
+	 *
+	 * %o0: page table physical address
+	 * %o1:	TSB register value
+	 * %o2:	TSB virtual address
+	 * %o3:	TSB mapping locked PTE
+	 * %o4:	Hypervisor TSB descriptor physical address
+	 *
+	 * We have to run this whole thing with interrupts
+	 * disabled so that the current cpu doesn't change
+	 * due to preemption.
+	 */
+	.align	32
+	.globl	__tsb_context_switch
+	.type	__tsb_context_switch,#function
+__tsb_context_switch:
+	rdpr	%pstate, %o5
+	wrpr	%o5, PSTATE_IE, %pstate
+
+	ldub	[%g6 + TI_CPU], %g1
+	sethi	%hi(trap_block), %g2
+	sllx	%g1, TRAP_BLOCK_SZ_SHIFT, %g1
+	or	%g2, %lo(trap_block), %g2
+	add	%g2, %g1, %g2
+	stx	%o0, [%g2 + TRAP_PER_CPU_PGD_PADDR]
+
+	sethi	%hi(tlb_type), %g1
+	lduw	[%g1 + %lo(tlb_type)], %g1
+	cmp	%g1, 3
+	bne,pt	%icc, 1f
+	 nop
+
+	/* Hypervisor TSB switch. */
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	stxa	%o1, [%g1] ASI_SCRATCHPAD
+	mov	-1, %g2
+	mov	SCRATCHPAD_UTSBREG2, %g1
+	stxa	%g2, [%g1] ASI_SCRATCHPAD
+
+	/* Save away %o5's %pstate, we have to use %o5 for
+	 * the hypervisor call.
+	 */
+	mov	%o5, %g1
+
+	mov	HV_FAST_MMU_TSB_CTXNON0, %o5
+	mov	1, %o0
+	mov	%o4, %o1
+	ta	HV_FAST_TRAP
+
+	/* Finish up and restore %o5.  */
+	ba,pt	%xcc, 9f
+	 mov	%g1, %o5
+
+	/* SUN4U TSB switch.  */
+1:	mov	TSB_REG, %g1
+	stxa	%o1, [%g1] ASI_DMMU
+	membar	#Sync
+	stxa	%o1, [%g1] ASI_IMMU
+	membar	#Sync
+
+2:	brz	%o2, 9f
+	 nop
+
+	sethi	%hi(sparc64_highest_unlocked_tlb_ent), %g2
+	mov	TLB_TAG_ACCESS, %g1
+	lduw	[%g2 + %lo(sparc64_highest_unlocked_tlb_ent)], %g2
+	stxa	%o2, [%g1] ASI_DMMU
+	membar	#Sync
+	sllx	%g2, 3, %g2
+	stxa	%o3, [%g2] ASI_DTLB_DATA_ACCESS
+	membar	#Sync
+9:
+	wrpr	%o5, %pstate
+
+	retl
+	 nop
+	.size	__tsb_context_switch, .-__tsb_context_switch
+
+#define TSB_PASS_BITS	((1 << TSB_TAG_LOCK_BIT) | \
+			 (1 << TSB_TAG_INVALID_BIT))
+
+	.align	32
+	.globl	copy_tsb
+	.type	copy_tsb,#function
+copy_tsb:		/* %o0=old_tsb_base, %o1=old_tsb_size
+			 * %o2=new_tsb_base, %o3=new_tsb_size
+			 */
+	sethi		%uhi(TSB_PASS_BITS), %g7
+	srlx		%o3, 4, %o3
+	add		%o0, %o1, %g1	/* end of old tsb */
+	sllx		%g7, 32, %g7
+	sub		%o3, 1, %o3	/* %o3 == new tsb hash mask */
+
+661:	prefetcha	[%o0] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o0] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+90:	andcc		%o0, (64 - 1), %g0
+	bne		1f
+	 add		%o0, 64, %o5
+
+661:	prefetcha	[%o5] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o5] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+1:	TSB_LOAD_QUAD(%o0, %g2)		/* %g2/%g3 == TSB entry */
+	andcc		%g2, %g7, %g0	/* LOCK or INVALID set? */
+	bne,pn		%xcc, 80f	/* Skip it */
+	 sllx		%g2, 22, %o4	/* TAG --> VADDR */
+
+	/* This can definitely be computed faster... */
+	srlx		%o0, 4, %o5	/* Build index */
+	and		%o5, 511, %o5	/* Mask index */
+	sllx		%o5, PAGE_SHIFT, %o5 /* Put into vaddr position */
+	or		%o4, %o5, %o4	/* Full VADDR. */
+	srlx		%o4, PAGE_SHIFT, %o4 /* Shift down to create index */
+	and		%o4, %o3, %o4	/* Mask with new_tsb_nents-1 */
+	sllx		%o4, 4, %o4	/* Shift back up into tsb ent offset */
+	TSB_STORE(%o2 + %o4, %g2)	/* Store TAG */
+	add		%o4, 0x8, %o4	/* Advance to TTE */
+	TSB_STORE(%o2 + %o4, %g3)	/* Store TTE */
+
+80:	add		%o0, 16, %o0
+	cmp		%o0, %g1
+	bne,pt		%xcc, 90b
+	 nop
+
+	retl
+	 TSB_MEMBAR
+	.size		copy_tsb, .-copy_tsb
+
+	/* Set the invalid bit in all TSB entries.  */
+	.align		32
+	.globl		tsb_init
+	.type		tsb_init,#function
+tsb_init:		/* %o0 = TSB vaddr, %o1 = size in bytes */
+	prefetch	[%o0 + 0x000], #n_writes
+	mov		1, %g1
+	prefetch	[%o0 + 0x040], #n_writes
+	sllx		%g1, TSB_TAG_INVALID_BIT, %g1
+	prefetch	[%o0 + 0x080], #n_writes
+1:	prefetch	[%o0 + 0x0c0], #n_writes
+	stx		%g1, [%o0 + 0x00]
+	stx		%g1, [%o0 + 0x10]
+	stx		%g1, [%o0 + 0x20]
+	stx		%g1, [%o0 + 0x30]
+	prefetch	[%o0 + 0x100], #n_writes
+	stx		%g1, [%o0 + 0x40]
+	stx		%g1, [%o0 + 0x50]
+	stx		%g1, [%o0 + 0x60]
+	stx		%g1, [%o0 + 0x70]
+	prefetch	[%o0 + 0x140], #n_writes
+	stx		%g1, [%o0 + 0x80]
+	stx		%g1, [%o0 + 0x90]
+	stx		%g1, [%o0 + 0xa0]
+	stx		%g1, [%o0 + 0xb0]
+	prefetch	[%o0 + 0x180], #n_writes
+	stx		%g1, [%o0 + 0xc0]
+	stx		%g1, [%o0 + 0xd0]
+	stx		%g1, [%o0 + 0xe0]
+	stx		%g1, [%o0 + 0xf0]
+	subcc		%o1, 0x100, %o1
+	bne,pt		%xcc, 1b
+	 add		%o0, 0x100, %o0
+	retl
+	 nop
+	nop
+	nop
+	.size		tsb_init, .-tsb_init
+
+	.globl		NGtsb_init
+	.type		NGtsb_init,#function
+NGtsb_init:
+	rd		%asi, %g2
+	mov		1, %g1
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	sllx		%g1, TSB_TAG_INVALID_BIT, %g1
+1:	stxa		%g1, [%o0 + 0x00] %asi
+	stxa		%g1, [%o0 + 0x10] %asi
+	stxa		%g1, [%o0 + 0x20] %asi
+	stxa		%g1, [%o0 + 0x30] %asi
+	stxa		%g1, [%o0 + 0x40] %asi
+	stxa		%g1, [%o0 + 0x50] %asi
+	stxa		%g1, [%o0 + 0x60] %asi
+	stxa		%g1, [%o0 + 0x70] %asi
+	stxa		%g1, [%o0 + 0x80] %asi
+	stxa		%g1, [%o0 + 0x90] %asi
+	stxa		%g1, [%o0 + 0xa0] %asi
+	stxa		%g1, [%o0 + 0xb0] %asi
+	stxa		%g1, [%o0 + 0xc0] %asi
+	stxa		%g1, [%o0 + 0xd0] %asi
+	stxa		%g1, [%o0 + 0xe0] %asi
+	stxa		%g1, [%o0 + 0xf0] %asi
+	subcc		%o1, 0x100, %o1
+	bne,pt		%xcc, 1b
+	 add		%o0, 0x100, %o0
+	retl
+	 wr		%g2, 0x0, %asi
+	.size		NGtsb_init, .-NGtsb_init

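copy_tsb's rehash above rebuilds each entry's virtual address from its tag (vaddr >> 22) plus the nine index bits implied by its slot in the old table, then re-indexes into the new table. The same computation in C, as a sketch — it uses the byte offset rather than the absolute entry address, which is equivalent for a size-aligned TSB, and assumes sparc64's 8K PAGE_SHIFT:

#include <stdint.h>

#define PAGE_SHIFT 13	/* assumed: sparc64 base page size */

static uint64_t new_tsb_byte_offset(uint64_t tag, uint64_t old_byte_off,
				    uint64_t new_nents)
{
	/* 16-byte entries: slot index = byte offset >> 4, masked to 512. */
	uint64_t vaddr = (tag << 22) |
			 (((old_byte_off >> 4) & 511) << PAGE_SHIFT);

	/* Hash into the new table and convert back to a byte offset. */
	return ((vaddr >> PAGE_SHIFT) & (new_nents - 1)) << 4;
}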
+ 25 - 38
arch/sparc64/kernel/ttable.S

@@ -1,7 +1,6 @@
-/* $Id: ttable.S,v 1.38 2002/02/09 19:49:30 davem Exp $
- * ttable.S: Sparc V9 Trap Table(s) with SpitFire/Cheetah extensions.
+/* ttable.S: Sparc V9 Trap Table(s) with SpitFire/Cheetah/SUN4V extensions.
  *
- * Copyright (C) 1996, 2001 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996, 2001, 2006 David S. Miller (davem@davemloft.net)
  */
 
 #include <linux/config.h>
@@ -19,7 +18,7 @@ tl0_resv000:	BOOT_KERNEL BTRAP(0x1) BTRAP(0x2) BTRAP(0x3)
 tl0_resv004:	BTRAP(0x4)  BTRAP(0x5) BTRAP(0x6) BTRAP(0x7)
 tl0_iax:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_insn_access_exception)
-tl0_resv009:	BTRAP(0x9)
+tl0_itsb_4v:	SUN4V_ITSB_MISS
 tl0_iae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl0_resv00b:	BTRAP(0xb) BTRAP(0xc) BTRAP(0xd) BTRAP(0xe) BTRAP(0xf)
@@ -38,7 +37,7 @@ tl0_div0:	TRAP(do_div0)
 tl0_resv029:	BTRAP(0x29) BTRAP(0x2a) BTRAP(0x2b) BTRAP(0x2c) BTRAP(0x2d) BTRAP(0x2e)
 tl0_resv02f:	BTRAP(0x2f)
 tl0_dax:	TRAP_NOSAVE(__spitfire_data_access_exception)
-tl0_resv031:	BTRAP(0x31)
+tl0_dtsb_4v:	SUN4V_DTSB_MISS
 tl0_dae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl0_resv033:	BTRAP(0x33)
@@ -52,12 +51,13 @@ tl0_resv03e:	BTRAP(0x3e) BTRAP(0x3f) BTRAP(0x40)
 tl0_irq1:	TRAP_IRQ(smp_call_function_client, 1)
 tl0_irq2:	TRAP_IRQ(smp_receive_signal_client, 2)
 tl0_irq3:	TRAP_IRQ(smp_penguin_jailcell, 3)
+tl0_irq4:	TRAP_IRQ(smp_new_mmu_context_version_client, 4)
 #else
 tl0_irq1:	BTRAP(0x41)
 tl0_irq2:	BTRAP(0x42)
 tl0_irq3:	BTRAP(0x43)
+tl0_irq4:	BTRAP(0x44)
 #endif
-tl0_irq4:	TRAP_IRQ(handler_irq, 4)
 tl0_irq5:	TRAP_IRQ(handler_irq, 5)  TRAP_IRQ(handler_irq, 6)
 tl0_irq7:	TRAP_IRQ(handler_irq, 7)  TRAP_IRQ(handler_irq, 8)
 tl0_irq9:	TRAP_IRQ(handler_irq, 9)  TRAP_IRQ(handler_irq, 10)
@@ -78,9 +78,9 @@ tl0_vaw:	TRAP(do_vaw)
 tl0_cee:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_cee_trap)
 tl0_iamiss:
-#include	"itlb_base.S"
+#include	"itlb_miss.S"
 tl0_damiss:
-#include	"dtlb_base.S"
+#include	"dtlb_miss.S"
 tl0_daprot:
 #include	"dtlb_prot.S"
 tl0_fecc:	BTRAP(0x70)	/* Fast-ECC on Cheetah */
@@ -88,15 +88,18 @@ tl0_dcpe:	BTRAP(0x71)	/* D-cache Parity Error on Cheetah+ */
 tl0_icpe:	BTRAP(0x72)	/* I-cache Parity Error on Cheetah+ */
 tl0_resv073:	BTRAP(0x73) BTRAP(0x74) BTRAP(0x75)
 tl0_resv076:	BTRAP(0x76) BTRAP(0x77) BTRAP(0x78) BTRAP(0x79) BTRAP(0x7a) BTRAP(0x7b)
-tl0_resv07c:	BTRAP(0x7c) BTRAP(0x7d) BTRAP(0x7e) BTRAP(0x7f)
+tl0_cpu_mondo:	TRAP_NOSAVE(sun4v_cpu_mondo)
+tl0_dev_mondo:	TRAP_NOSAVE(sun4v_dev_mondo)
+tl0_res_mondo:	TRAP_NOSAVE(sun4v_res_mondo)
+tl0_nres_mondo:	TRAP_NOSAVE(sun4v_nonres_mondo)
 tl0_s0n:	SPILL_0_NORMAL
 tl0_s1n:	SPILL_1_NORMAL
 tl0_s2n:	SPILL_2_NORMAL
-tl0_s3n:	SPILL_3_NORMAL
-tl0_s4n:	SPILL_4_NORMAL
-tl0_s5n:	SPILL_5_NORMAL
-tl0_s6n:	SPILL_6_NORMAL
-tl0_s7n:	SPILL_7_NORMAL
+tl0_s3n:	SPILL_0_NORMAL_ETRAP
+tl0_s4n:	SPILL_1_GENERIC_ETRAP
+tl0_s5n:	SPILL_1_GENERIC_ETRAP_FIXUP
+tl0_s6n:	SPILL_2_GENERIC_ETRAP
+tl0_s7n:	SPILL_2_GENERIC_ETRAP_FIXUP
 tl0_s0o:	SPILL_0_OTHER
 tl0_s1o:	SPILL_1_OTHER
 tl0_s2o:	SPILL_2_OTHER
@@ -110,9 +113,9 @@ tl0_f1n:	FILL_1_NORMAL
 tl0_f2n:	FILL_2_NORMAL
 tl0_f3n:	FILL_3_NORMAL
 tl0_f4n:	FILL_4_NORMAL
-tl0_f5n:	FILL_5_NORMAL
-tl0_f6n:	FILL_6_NORMAL
-tl0_f7n:	FILL_7_NORMAL
+tl0_f5n:	FILL_0_NORMAL_RTRAP
+tl0_f6n:	FILL_1_GENERIC_RTRAP
+tl0_f7n:	FILL_2_GENERIC_RTRAP
 tl0_f0o:	FILL_0_OTHER
 tl0_f1o:	FILL_1_OTHER
 tl0_f2o:	FILL_2_OTHER
@@ -128,7 +131,7 @@ tl0_flushw:	FLUSH_WINDOW_TRAP
 tl0_resv104:	BTRAP(0x104) BTRAP(0x105) BTRAP(0x106) BTRAP(0x107)
 		.globl tl0_solaris
 tl0_solaris:	SOLARIS_SYSCALL_TRAP
-tl0_netbsd:	NETBSD_SYSCALL_TRAP
+tl0_resv109:	BTRAP(0x109)
 tl0_resv10a:	BTRAP(0x10a) BTRAP(0x10b) BTRAP(0x10c) BTRAP(0x10d) BTRAP(0x10e)
 tl0_resv10f:	BTRAP(0x10f)
 tl0_linux32:	LINUX_32BIT_SYSCALL_TRAP
@@ -179,7 +182,7 @@ sparc64_ttable_tl1:
 tl1_resv000:	BOOT_KERNEL    BTRAPTL1(0x1) BTRAPTL1(0x2) BTRAPTL1(0x3)
 tl1_resv004:	BTRAPTL1(0x4)  BTRAPTL1(0x5) BTRAPTL1(0x6) BTRAPTL1(0x7)
 tl1_iax:	TRAP_NOSAVE(__spitfire_insn_access_exception_tl1)
-tl1_resv009:	BTRAPTL1(0x9)
+tl1_itsb_4v:	SUN4V_ITSB_MISS
 tl1_iae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl1_resv00b:	BTRAPTL1(0xb) BTRAPTL1(0xc) BTRAPTL1(0xd) BTRAPTL1(0xe) BTRAPTL1(0xf)
@@ -198,7 +201,7 @@ tl1_div0:	TRAPTL1(do_div0_tl1)
 tl1_resv029:	BTRAPTL1(0x29) BTRAPTL1(0x2a) BTRAPTL1(0x2b) BTRAPTL1(0x2c)
 tl1_resv02d:	BTRAPTL1(0x2d) BTRAPTL1(0x2e) BTRAPTL1(0x2f)
 tl1_dax:	TRAP_NOSAVE(__spitfire_data_access_exception_tl1)
-tl1_resv031:	BTRAPTL1(0x31)
+tl1_dtsb_4v:	SUN4V_DTSB_MISS
 tl1_dae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl1_resv033:	BTRAPTL1(0x33)
@@ -222,26 +225,10 @@ tl1_resv05c:	BTRAPTL1(0x5c) BTRAPTL1(0x5d) BTRAPTL1(0x5e) BTRAPTL1(0x5f)
 tl1_ivec:	TRAP_IVEC
 tl1_paw:	TRAPTL1(do_paw_tl1)
 tl1_vaw:	TRAPTL1(do_vaw_tl1)
-
-		/* The grotty trick to save %g1 into current->thread.cee_stuff
-		 * is because when we take this trap we could be interrupting
-		 * trap code already using the trap alternate global registers.
-		 *
-		 * We cross our fingers and pray that this store/load does
-		 * not cause yet another CEE trap.
-		 */
-tl1_cee:	membar	#Sync
-		stx	%g1, [%g6 + TI_CEE_STUFF]
-		ldxa	[%g0] ASI_AFSR, %g1
-		membar	#Sync
-		stxa	%g1, [%g0] ASI_AFSR
-		membar	#Sync
-		ldx	[%g6 + TI_CEE_STUFF], %g1
-		retry
-
+tl1_cee:	BTRAPTL1(0x63)
 tl1_iamiss:	BTRAPTL1(0x64) BTRAPTL1(0x65) BTRAPTL1(0x66) BTRAPTL1(0x67)
 tl1_damiss:
-#include	"dtlb_backend.S"
+#include	"dtlb_miss.S"
 tl1_daprot:
 #include	"dtlb_prot.S"
 tl1_fecc:	BTRAPTL1(0x70)	/* Fast-ECC on Cheetah */

+ 34 - 11
arch/sparc64/kernel/unaligned.c

@@ -277,7 +277,7 @@ static void kernel_mna_trap_fault(void)
 	regs->tstate |= (ASI_AIUS << 24UL);
 }
 
-asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn, unsigned long sfar, unsigned long sfsr)
+asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn)
 {
 	enum direction dir = decode_direction(insn);
 	int size = decode_access_size(insn);
@@ -405,6 +405,9 @@ extern void do_privact(struct pt_regs *regs);
 extern void spitfire_data_access_exception(struct pt_regs *regs,
 					   unsigned long sfsr,
 					   unsigned long sfar);
+extern void sun4v_data_access_exception(struct pt_regs *regs,
+					unsigned long addr,
+					unsigned long type_ctx);
 
 int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 {
@@ -447,14 +450,20 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 				break;
 			}
 		default:
-			spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 			return 1;
 		}
 		if (put_user (first >> 32, (u32 __user *)addr) ||
 		    __put_user ((u32)first, (u32 __user *)(addr + 4)) ||
 		    __put_user (second >> 32, (u32 __user *)(addr + 8)) ||
 		    __put_user ((u32)second, (u32 __user *)(addr + 12))) {
-		    	spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 		    	return 1;
 		}
 	} else {
@@ -467,7 +476,10 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 			do_privact(regs);
 			return 1;
 		} else if (asi > ASI_SNFL) {
-			spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 			return 1;
 		}
 		switch (insn & 0x180000) {
@@ -484,7 +496,10 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 				err |= __get_user (data[i], (u32 __user *)(addr + 4*i));
 		}
 		if (err && !(asi & 0x2 /* NF */)) {
-			spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 			return 1;
 		}
 		if (asi & 0x8) /* Little */ {
@@ -548,7 +563,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	u32 insn;
 	u32 first, second;
 	u64 value;
-	u8 asi, freg;
+	u8 freg;
 	int flag;
 	struct fpustate *f = FPUSTATE;
 
@@ -557,7 +572,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
 	if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
-		asi = sfsr >> 16;
+		int asi = decode_asi(insn, regs);
 		if ((asi > ASI_SNFL) ||
 		    (asi < ASI_P))
 			goto daex;
@@ -587,7 +602,11 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 		*(u64 *)(f->regs + freg) = value;
 		current_thread_info()->fpsaved[0] |= flag;
 	} else {
-daex:		spitfire_data_access_exception(regs, sfsr, sfar);
+daex:
+		if (tlb_type == hypervisor)
+			sun4v_data_access_exception(regs, sfar, sfsr);
+		else
+			spitfire_data_access_exception(regs, sfsr, sfar);
 		return;
 	}
 	advance(regs);
@@ -600,7 +619,7 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	unsigned long tstate = regs->tstate;
 	u32 insn;
 	u64 value;
-	u8 asi, freg;
+	u8 freg;
 	int flag;
 	struct fpustate *f = FPUSTATE;
 
@@ -609,8 +628,8 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
 	if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
+		int asi = decode_asi(insn, regs);
 		freg = ((insn >> 25) & 0x1e) | ((insn >> 20) & 0x20);
-		asi = sfsr >> 16;
 		value = 0;
 		flag = (freg < 32) ? FPRS_DL : FPRS_DU;
 		if ((asi > ASI_SNFL) ||
@@ -631,7 +650,11 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 		    __put_user ((u32)value, (u32 __user *)(sfar + 4)))
 			goto daex;
 	} else {
-daex:		spitfire_data_access_exception(regs, sfsr, sfar);
+daex:
+		if (tlb_type == hypervisor)
+			sun4v_data_access_exception(regs, sfar, sfsr);
+		else
+			spitfire_data_access_exception(regs, sfsr, sfar);
 		return;
 	}
 	advance(regs);

+ 5 - 6
arch/sparc64/kernel/us2e_cpufreq.c

@@ -346,6 +346,9 @@ static int __init us2e_freq_init(void)
 	unsigned long manuf, impl, ver;
 	int ret;
 
+	if (tlb_type != spitfire)
+		return -ENODEV;
+
 	__asm__("rdpr %%ver, %0" : "=r" (ver));
 	manuf = ((ver >> 48) & 0xffff);
 	impl  = ((ver >> 32) & 0xffff);
@@ -354,20 +357,16 @@ static int __init us2e_freq_init(void)
 		struct cpufreq_driver *driver;
 
 		ret = -ENOMEM;
-		driver = kmalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
+		driver = kzalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
 		if (!driver)
 			goto err_out;
-		memset(driver, 0, sizeof(*driver));
 
-		us2e_freq_table = kmalloc(
+		us2e_freq_table = kzalloc(
 			(NR_CPUS * sizeof(struct us2e_freq_percpu_info)),
 			GFP_KERNEL);
 		if (!us2e_freq_table)
 			goto err_out;
 
-		memset(us2e_freq_table, 0,
-		       (NR_CPUS * sizeof(struct us2e_freq_percpu_info)));
-
 		driver->init = us2e_freq_cpu_init;
 		driver->verify = us2e_freq_verify;
 		driver->target = us2e_freq_target;

+ 5 - 6
arch/sparc64/kernel/us3_cpufreq.c

@@ -203,6 +203,9 @@ static int __init us3_freq_init(void)
 	unsigned long manuf, impl, ver;
 	int ret;
 
+	if (tlb_type != cheetah && tlb_type != cheetah_plus)
+		return -ENODEV;
+
 	__asm__("rdpr %%ver, %0" : "=r" (ver));
 	manuf = ((ver >> 48) & 0xffff);
 	impl  = ((ver >> 32) & 0xffff);
@@ -215,20 +218,16 @@ static int __init us3_freq_init(void)
 		struct cpufreq_driver *driver;
 
 		ret = -ENOMEM;
-		driver = kmalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
+		driver = kzalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
 		if (!driver)
 			goto err_out;
-		memset(driver, 0, sizeof(*driver));
 
-		us3_freq_table = kmalloc(
+		us3_freq_table = kzalloc(
 			(NR_CPUS * sizeof(struct us3_freq_percpu_info)),
 			GFP_KERNEL);
 		if (!us3_freq_table)
 			goto err_out;
 
-		memset(us3_freq_table, 0,
-		       (NR_CPUS * sizeof(struct us3_freq_percpu_info)));
-
 		driver->init = us3_freq_cpu_init;
 		driver->verify = us3_freq_verify;
 		driver->target = us3_freq_target;

+ 894 - 0
arch/sparc64/kernel/visemul.c

@@ -0,0 +1,894 @@
+/* visemul.c: Emulation of VIS instructions.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/thread_info.h>
+
+#include <asm/ptrace.h>
+#include <asm/pstate.h>
+#include <asm/system.h>
+#include <asm/fpumacro.h>
+#include <asm/uaccess.h>
+
+/* OPF field of various VIS instructions.  */
+
+/* 000111011 - four 16-bit packs  */
+#define FPACK16_OPF	0x03b
+
+/* 000111010 - two 32-bit packs  */
+#define FPACK32_OPF	0x03a
+
+/* 000111101 - two 16-bit packs  */
+#define FPACKFIX_OPF	0x03d
+
+/* 001001101 - four 16-bit expands  */
+#define FEXPAND_OPF	0x04d
+
+/* 001001011 - two 32-bit merges */
+#define FPMERGE_OPF	0x04b
+
+/* 000110001 - 8-by-16-bit partitioned product  */
+#define FMUL8x16_OPF	0x031
+
+/* 000110011 - 8-by-16-bit upper alpha partitioned product  */
+#define FMUL8x16AU_OPF	0x033
+
+/* 000110101 - 8-by-16-bit lower alpha partitioned product  */
+#define FMUL8x16AL_OPF	0x035
+
+/* 000110110 - upper 8-by-16-bit partitioned product  */
+#define FMUL8SUx16_OPF	0x036
+
+/* 000110111 - lower 8-by-16-bit partitioned product  */
+#define FMUL8ULx16_OPF	0x037
+
+/* 000111000 - upper 8-by-16-bit partitioned product  */
+#define FMULD8SUx16_OPF	0x038
+
+/* 000111001 - lower unsigned 8-by-16-bit partitioned product  */
+#define FMULD8ULx16_OPF	0x039
+
+/* 000101000 - four 16-bit compare; set rd if src1 > src2  */
+#define FCMPGT16_OPF	0x028
+
+/* 000101100 - two 32-bit compare; set rd if src1 > src2  */
+#define FCMPGT32_OPF	0x02c
+
+/* 000100000 - four 16-bit compare; set rd if src1 <= src2  */
+#define FCMPLE16_OPF	0x020
+
+/* 000100100 - two 32-bit compare; set rd if src1 <= src2  */
+#define FCMPLE32_OPF	0x024
+
+/* 000100010 - four 16-bit compare; set rd if src1 != src2  */
+#define FCMPNE16_OPF	0x022
+
+/* 000100110 - two 32-bit compare; set rd if src1 != src2  */
+#define FCMPNE32_OPF	0x026
+
+/* 000101010 - four 16-bit compare; set rd if src1 == src2  */
+#define FCMPEQ16_OPF	0x02a
+
+/* 000101110 - two 32-bit compare; set rd if src1 == src2  */
+#define FCMPEQ32_OPF	0x02e
+
+/* 000000000 - Eight 8-bit edge boundary processing  */
+#define EDGE8_OPF	0x000
+
+/* 000000001 - Eight 8-bit edge boundary processing, no CC */
+#define EDGE8N_OPF	0x001
+
+/* 000000010 - Eight 8-bit edge boundary processing, little-endian  */
+#define EDGE8L_OPF	0x002
+
+/* 000000011 - Eight 8-bit edge boundary processing, little-endian, no CC  */
+#define EDGE8LN_OPF	0x003
+
+/* 000000100 - Four 16-bit edge boundary processing  */
+#define EDGE16_OPF	0x004
+
+/* 000000101 - Four 16-bit edge boundary processing, no CC  */
+#define EDGE16N_OPF	0x005
+
+/* 000000110 - Four 16-bit edge boundary processing, little-endian  */
+#define EDGE16L_OPF	0x006
+
+/* 000000111 - Four 16-bit edge boundary processing, little-endian, no CC  */
+#define EDGE16LN_OPF	0x007
+
+/* 000001000 - Two 32-bit edge boundary processing  */
+#define EDGE32_OPF	0x008
+
+/* 000001001 - Two 32-bit edge boundary processing, no CC  */
+#define EDGE32N_OPF	0x009
+
+/* 000001010 - Two 32-bit edge boundary processing, little-endian  */
+#define EDGE32L_OPF	0x00a
+
+/* 000001011 - Two 32-bit edge boundary processing, little-endian, no CC  */
+#define EDGE32LN_OPF	0x00b
+
+/* 000111110 - distance between 8 8-bit components  */
+#define PDIST_OPF	0x03e
+
+/* 000010000 - convert 8-bit 3-D address to blocked byte address  */
+#define ARRAY8_OPF	0x010
+
+/* 000010010 - convert 16-bit 3-D address to blocked byte address  */
+#define ARRAY16_OPF	0x012
+
+/* 000010100 - convert 32-bit 3-D address to blocked byte address  */
+#define ARRAY32_OPF	0x014
+
+/* 000011001 - Set the GSR.MASK field in preparation for a BSHUFFLE  */
+#define BMASK_OPF	0x019
+
+/* 001001100 - Permute bytes as specified by GSR.MASK  */
+#define BSHUFFLE_OPF	0x04c
+
+#define VIS_OPCODE_MASK	((0x3 << 30) | (0x3f << 19))
+#define VIS_OPCODE_VAL	((0x2 << 30) | (0x36 << 19))
+
+#define VIS_OPF_SHIFT	5
+#define VIS_OPF_MASK	(0x1ff << VIS_OPF_SHIFT)
+
+#define RS1(INSN)	(((INSN) >> 14) & 0x1f)
+#define RS2(INSN)	(((INSN) >>  0) & 0x1f)
+#define RD(INSN)	(((INSN) >> 25) & 0x1f)
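+
+/* For orientation, a worked decode: VIS ops live in the IMPDEP1 space,
+ * op = 2 in bits 31:30 and op3 = 0x36 in bits 24:19, which is exactly
+ * what the VIS_OPCODE_MASK/VIS_OPCODE_VAL test matches; opf then
+ * occupies bits 13:5, so an FPACK16 word yields opf == 0x03b, i.e.
+ * FPACK16_OPF.
+ */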
+
+static inline void maybe_flush_windows(unsigned int rs1, unsigned int rs2,
+				       unsigned int rd, int from_kernel)
+{
+	if (rs2 >= 16 || rs1 >= 16 || rd >= 16) {
+		if (from_kernel != 0)
+			__asm__ __volatile__("flushw");
+		else
+			flushw_user();
+	}
+}
+
+static unsigned long fetch_reg(unsigned int reg, struct pt_regs *regs)
+{
+	unsigned long value;
+	
+	if (reg < 16)
+		return (!reg ? 0 : regs->u_regs[reg]);
+	if (regs->tstate & TSTATE_PRIV) {
+		struct reg_window *win;
+		win = (struct reg_window *)(regs->u_regs[UREG_FP] + STACK_BIAS);
+		value = win->locals[reg - 16];
+	} else if (test_thread_flag(TIF_32BIT)) {
+		struct reg_window32 __user *win32;
+		win32 = (struct reg_window32 __user *)((unsigned long)((u32)regs->u_regs[UREG_FP]));
+		get_user(value, &win32->locals[reg - 16]);
+	} else {
+		struct reg_window __user *win;
+		win = (struct reg_window __user *)(regs->u_regs[UREG_FP] + STACK_BIAS);
+		get_user(value, &win->locals[reg - 16]);
+	}
+	return value;
+}
+
+static inline unsigned long __user *__fetch_reg_addr_user(unsigned int reg,
+							  struct pt_regs *regs)
+{
+	BUG_ON(reg < 16);
+	BUG_ON(regs->tstate & TSTATE_PRIV);
+
+	if (test_thread_flag(TIF_32BIT)) {
+		struct reg_window32 __user *win32;
+		win32 = (struct reg_window32 __user *)((unsigned long)((u32)regs->u_regs[UREG_FP]));
+		return (unsigned long __user *)&win32->locals[reg - 16];
+	} else {
+		struct reg_window __user *win;
+		win = (struct reg_window __user *)(regs->u_regs[UREG_FP] + STACK_BIAS);
+		return &win->locals[reg - 16];
+	}
+}
+
+static inline unsigned long *__fetch_reg_addr_kern(unsigned int reg,
+						   struct pt_regs *regs)
+{
+	BUG_ON(reg >= 16);
+	BUG_ON(regs->tstate & TSTATE_PRIV);
+
+	return &regs->u_regs[reg];
+}
+
+static void store_reg(struct pt_regs *regs, unsigned long val, unsigned long rd)
+{
+	if (rd < 16) {
+		unsigned long *rd_kern = __fetch_reg_addr_kern(rd, regs);
+
+		*rd_kern = val;
+	} else {
+		unsigned long __user *rd_user = __fetch_reg_addr_user(rd, regs);
+
+		if (test_thread_flag(TIF_32BIT))
+			__put_user((u32)val, (u32 __user *)rd_user);
+		else
+			__put_user(val, rd_user);
+	}
+}
+
+static inline unsigned long fpd_regval(struct fpustate *f,
+				       unsigned int insn_regnum)
+{
+	insn_regnum = (((insn_regnum & 1) << 5) |
+		       (insn_regnum & 0x1e));
+
+	return *(unsigned long *) &f->regs[insn_regnum];
+}
+
+static inline unsigned long *fpd_regaddr(struct fpustate *f,
+					 unsigned int insn_regnum)
+{
+	insn_regnum = (((insn_regnum & 1) << 5) |
+		       (insn_regnum & 0x1e));
+
+	return (unsigned long *) &f->regs[insn_regnum];
+}
+
+static inline unsigned int fps_regval(struct fpustate *f,
+				      unsigned int insn_regnum)
+{
+	return f->regs[insn_regnum];
+}
+
+static inline unsigned int *fps_regaddr(struct fpustate *f,
+					unsigned int insn_regnum)
+{
+	return &f->regs[insn_regnum];
+}
+
+struct edge_tab {
+	u16 left, right;
+};
+struct edge_tab edge8_tab[8] = {
+	{ 0xff, 0x80 },
+	{ 0x7f, 0xc0 },
+	{ 0x3f, 0xe0 },
+	{ 0x1f, 0xf0 },
+	{ 0x0f, 0xf8 },
+	{ 0x07, 0xfc },
+	{ 0x03, 0xfe },
+	{ 0x01, 0xff },
+};
+struct edge_tab edge8_tab_l[8] = {
+	{ 0xff, 0x01 },
+	{ 0xfe, 0x03 },
+	{ 0xfc, 0x07 },
+	{ 0xf8, 0x0f },
+	{ 0xf0, 0x1f },
+	{ 0xe0, 0x3f },
+	{ 0xc0, 0x7f },
+	{ 0x80, 0xff },
+};
+struct edge_tab edge16_tab[4] = {
+	{ 0xf, 0x8 },
+	{ 0x7, 0xc },
+	{ 0x3, 0xe },
+	{ 0x1, 0xf },
+};
+struct edge_tab edge16_tab_l[4] = {
+	{ 0xf, 0x1 },
+	{ 0xe, 0x3 },
+	{ 0xc, 0x7 },
+	{ 0x8, 0xf },
+};
+struct edge_tab edge32_tab[2] = {
+	{ 0x3, 0x2 },
+	{ 0x1, 0x3 },
+};
+struct edge_tab edge32_tab_l[2] = {
+	{ 0x3, 0x1 },
+	{ 0x2, 0x3 },
+};
+
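+/* Each row of the tables above, indexed by the low address bits, gives
+ * the partial pixel-enable masks for the first (left) and the last
+ * (right) block of a scan line, per the VIS edge instruction
+ * definitions; the _l variants are the little-endian orderings.
+ */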
+static void edge(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	unsigned long orig_rs1, rs1, orig_rs2, rs2, rd_val;
+	u16 left, right;
+
+	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
+	orig_rs1 = rs1 = fetch_reg(RS1(insn), regs);
+	orig_rs2 = rs2 = fetch_reg(RS2(insn), regs);
+
+	if (test_thread_flag(TIF_32BIT)) {
+		rs1 = rs1 & 0xffffffff;
+		rs2 = rs2 & 0xffffffff;
+	}
+	switch (opf) {
+	default:
+	case EDGE8_OPF:
+	case EDGE8N_OPF:
+		left = edge8_tab[rs1 & 0x7].left;
+		right = edge8_tab[rs2 & 0x7].right;
+		break;
+	case EDGE8L_OPF:
+	case EDGE8LN_OPF:
+		left = edge8_tab_l[rs1 & 0x7].left;
+		right = edge8_tab_l[rs2 & 0x7].right;
+		break;
+
+	case EDGE16_OPF:
+	case EDGE16N_OPF:
+		left = edge16_tab[(rs1 >> 1) & 0x3].left;
+		right = edge16_tab[(rs2 >> 1) & 0x3].right;
+		break;
+
+	case EDGE16L_OPF:
+	case EDGE16LN_OPF:
+		left = edge16_tab_l[(rs1 >> 1) & 0x3].left;
+		right = edge16_tab_l[(rs2 >> 1) & 0x3].right;
+		break;
+
+	case EDGE32_OPF:
+	case EDGE32N_OPF:
+		left = edge32_tab[(rs1 >> 2) & 0x1].left;
+		right = edge32_tab[(rs2 >> 2) & 0x1].right;
+		break;
+
+	case EDGE32L_OPF:
+	case EDGE32LN_OPF:
+		left = edge32_tab_l[(rs1 >> 2) & 0x1].left;
+		right = edge32_tab_l[(rs2 >> 2) & 0x1].right;
+		break;
+	};
+
+	if ((rs1 & ~0x7UL) == (rs2 & ~0x7UL))
+		rd_val = right & left;
+	else
+		rd_val = left;
+
+	store_reg(regs, rd_val, RD(insn));
+
+	switch (opf) {
+	case EDGE8_OPF:
+	case EDGE8L_OPF:
+	case EDGE16_OPF:
+	case EDGE16L_OPF:
+	case EDGE32_OPF:
+	case EDGE32L_OPF: {
+		unsigned long ccr, tstate;
+
+		__asm__ __volatile__("subcc	%1, %2, %%g0\n\t"
+				     "rd	%%ccr, %0"
+				     : "=r" (ccr)
+				     : "r" (orig_rs1), "r" (orig_rs2)
+				     : "cc");
+		tstate = regs->tstate & ~(TSTATE_XCC | TSTATE_ICC);
+		regs->tstate = tstate | (ccr << 32UL);
+	}
+	};
+}
+
+static void array(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	unsigned long rs1, rs2, rd_val;
+	unsigned int bits, bits_mask;
+
+	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
+	rs1 = fetch_reg(RS1(insn), regs);
+	rs2 = fetch_reg(RS2(insn), regs);
+
+	bits = (rs2 > 5 ? 5 : rs2);
+	bits_mask = (1UL << bits) - 1UL;
+
+	rd_val = ((((rs1 >> 11) & 0x3) <<  0) |
+		  (((rs1 >> 33) & 0x3) <<  2) |
+		  (((rs1 >> 55) & 0x1) <<  4) |
+		  (((rs1 >> 13) & 0xf) <<  5) |
+		  (((rs1 >> 35) & 0xf) <<  9) |
+		  (((rs1 >> 56) & 0xf) << 13) |
+		  (((rs1 >> 17) & bits_mask) << 17) |
+		  (((rs1 >> 39) & bits_mask) << (17 + bits)) |
+		  (((rs1 >> 60) & 0xf)       << (17 + (2*bits))));
+
+	switch (opf) {
+	case ARRAY16_OPF:
+		rd_val <<= 1;
+		break;
+
+	case ARRAY32_OPF:
+		rd_val <<= 2;
+	};
+
+	store_reg(regs, rd_val, RD(insn));
+}
+
+static void bmask(struct pt_regs *regs, unsigned int insn)
+{
+	unsigned long rs1, rs2, rd_val, gsr;
+
+	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
+	rs1 = fetch_reg(RS1(insn), regs);
+	rs2 = fetch_reg(RS2(insn), regs);
+	rd_val = rs1 + rs2;
+
+	store_reg(regs, rd_val, RD(insn));
+
+	gsr = current_thread_info()->gsr[0] & 0xffffffff;
+	gsr |= rd_val << 32UL;
+	current_thread_info()->gsr[0] = gsr;
+}
+
+static void bshuffle(struct pt_regs *regs, unsigned int insn)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, rd_val;
+	unsigned long bmask, i;
+
+	bmask = current_thread_info()->gsr[0] >> 32UL;
+
+	rs1 = fpd_regval(f, RS1(insn));
+	rs2 = fpd_regval(f, RS2(insn));
+
+	rd_val = 0UL;
+	for (i = 0; i < 8; i++) {
+		unsigned long which = (bmask >> (i * 4)) & 0xf;
+		unsigned long byte;
+
+		if (which < 8)
+			byte = (rs1 >> (which * 8)) & 0xff;
+		else
+			byte = (rs2 >> ((which-8)*8)) & 0xff;
+		rd_val |= (byte << (i * 8));
+	}
+
+	*fpd_regaddr(f, RD(insn)) = rd_val;
+}
+
+static void pdist(struct pt_regs *regs, unsigned int insn)
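+/* pdist() emulates PDIST, the classic sum-of-absolute-differences
+ * primitive: each of the eight byte lanes contributes |rs1 - rs2| to
+ * the running sum carried in rd.
+ */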
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, *rd, rd_val;
+	unsigned long i;
+
+	rs1 = fpd_regval(f, RS1(insn));
+	rs2 = fpd_regval(f, RS2(insn));
+	rd = fpd_regaddr(f, RD(insn));
+
+	rd_val = *rd;
+
+	for (i = 0; i < 8; i++) {
+		s16 s1, s2;
+
+		s1 = (rs1 >> (56 - (i * 8))) & 0xff;
+		s2 = (rs2 >> (56 - (i * 8))) & 0xff;
+
+		/* Absolute value of difference. */
+		s1 -= s2;
+		if (s1 < 0)
+			s1 = ~s1 + 1;
+
+		rd_val += s1;
+	}
+
+	*rd = rd_val;
+}
+
+static void pformat(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, gsr, scale, rd_val;
+
+	gsr = current_thread_info()->gsr[0];
+	scale = (gsr >> 3) & (opf == FPACK16_OPF ? 0xf : 0x1f);
+	switch (opf) {
+	case FPACK16_OPF: {
+		unsigned long byte;
+
+		rs2 = fpd_regval(f, RS2(insn));
+		rd_val = 0;
+		for (byte = 0; byte < 4; byte++) {
+			unsigned int val;
+			s16 src = (rs2 >> (byte * 16UL)) & 0xffffUL;
+			int scaled = src << scale;
+			int from_fixed = scaled >> 7;
+
+			val = ((from_fixed < 0) ?
+			       0 :
+			       (from_fixed > 255) ?
+			       255 : from_fixed);
+
+			rd_val |= (val << (8 * byte));
+		}
+		*fps_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FPACK32_OPF: {
+		unsigned long word;
+
+		rs1 = fpd_regval(f, RS1(insn));
+		rs2 = fpd_regval(f, RS2(insn));
+		rd_val = (rs1 << 8) & ~(0x000000ff000000ffUL);
+		for (word = 0; word < 2; word++) {
+			unsigned long val;
+			s32 src = (rs2 >> (word * 32UL));
+			s64 scaled = src << scale;
+			s64 from_fixed = scaled >> 23;
+
+			val = ((from_fixed < 0) ?
+			       0 :
+			       (from_fixed > 255) ?
+			       255 : from_fixed);
+
+			rd_val |= (val << (32 * word));
+		}
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FPACKFIX_OPF: {
+		unsigned long word;
+
+		rs2 = fpd_regval(f, RS2(insn));
+
+		rd_val = 0;
+		for (word = 0; word < 2; word++) {
+			long val;
+			s32 src = (rs2 >> (word * 32UL));
+			s64 scaled = src << scale;
+			s64 from_fixed = scaled >> 16;
+
+			val = ((from_fixed < -32768) ?
+			       -32768 :
+			       (from_fixed > 32767) ?
+			       32767 : from_fixed);
+
+			rd_val |= ((val & 0xffff) << (word * 16));
+		}
+		*fps_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FEXPAND_OPF: {
+		unsigned long byte;
+
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = 0;
+		for (byte = 0; byte < 4; byte++) {
+			unsigned long val;
+			u8 src = (rs2 >> (byte * 8)) & 0xff;
+
+			val = src << 4;
+
+			rd_val |= (val << (byte * 16));
+		}
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FPMERGE_OPF: {
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = (((rs2 & 0x000000ff) <<  0) |
+			  ((rs1 & 0x000000ff) <<  8) |
+			  ((rs2 & 0x0000ff00) <<  8) |
+			  ((rs1 & 0x0000ff00) << 16) |
+			  ((rs2 & 0x00ff0000) << 16) |
+			  ((rs1 & 0x00ff0000) << 24) |
+			  ((rs2 & 0xff000000) << 24) |
+			  ((rs1 & 0xff000000) << 32));
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+	};
+}
+
+static void pmul(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, rd_val;
+
+	switch (opf) {
+	case FMUL8x16_OPF: {
+		unsigned long byte;
+
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fpd_regval(f, RS2(insn));
+
+		rd_val = 0;
+		for (byte = 0; byte < 4; byte++) {
+			u16 src1 = (rs1 >> (byte *  8)) & 0x00ff;
+			s16 src2 = (rs2 >> (byte * 16)) & 0xffff;
+			u32 prod = src1 * src2;
+			u16 scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
+		}
+
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FMUL8x16AU_OPF:
+	case FMUL8x16AL_OPF: {
+		unsigned long byte;
+		s16 src2;
+
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = 0;
+		src2 = rs2 >> ((opf == FMUL8x16AU_OPF) ? 16 : 0);
+		for (byte = 0; byte < 4; byte++) {
+			u16 src1 = (rs1 >> (byte * 8)) & 0x00ff;
+			u32 prod = src1 * src2;
+			u16 scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
+		}
+
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FMUL8SUx16_OPF:
+	case FMUL8ULx16_OPF: {
+		unsigned long byte, ushift;
+
+		rs1 = fpd_regval(f, RS1(insn));
+		rs2 = fpd_regval(f, RS2(insn));
+
+		rd_val = 0;
+		ushift = (opf == FMUL8SUx16_OPF) ? 8 : 0;
+		for (byte = 0; byte < 4; byte++) {
+			u16 src1;
+			s16 src2;
+			u32 prod;
+			u16 scaled;
+
+			src1 = ((rs1 >> ((16 * byte) + ushift)) & 0x00ff);
+			src2 = ((rs2 >> (16 * byte)) & 0xffff);
+			prod = src1 * src2;
+			scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
+		}
+
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FMULD8SUx16_OPF:
+	case FMULD8ULx16_OPF: {
+		unsigned long byte, ushift;
+
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = 0;
+		ushift = (opf == FMULD8SUx16_OPF) ? 8 : 0;
+		for (byte = 0; byte < 2; byte++) {
+			u16 src1;
+			s16 src2;
+			u32 prod;
+			u16 scaled;
+
+			src1 = ((rs1 >> ((16 * byte) + ushift)) & 0x00ff);
+			src2 = ((rs2 >> (16 * byte)) & 0xffff);
+			prod = src1 * src2;
+			scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) <<
+				   ((byte * 32UL) + 7UL));
+		}
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+	};
+}
+
+static void pcmp(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, rd_val, i;
+
+	rs1 = fpd_regval(f, RS1(insn));
+	rs2 = fpd_regval(f, RS2(insn));
+
+	rd_val = 0;
+
+	switch (opf) {
+	case FCMPGT16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a > b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPGT32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a > b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPLE16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a <= b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPLE32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a <= b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPNE16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a != b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPNE32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a != b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPEQ16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a == b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPEQ32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a == b)
+				rd_val |= 1 << i;
+		}
+		break;
+	};
+
+	maybe_flush_windows(0, 0, RD(insn), 0);
+	store_reg(regs, rd_val, RD(insn));
+}
+
+/* Emulate the VIS instructions which are not implemented in
+ * hardware on Niagara.
+ */
+int vis_emul(struct pt_regs *regs, unsigned int insn)
+{
+	unsigned long pc = regs->tpc;
+	unsigned int opf;
+
+	BUG_ON(regs->tstate & TSTATE_PRIV);
+
+	if (test_thread_flag(TIF_32BIT))
+		pc = (u32)pc;
+
+	if (get_user(insn, (u32 __user *) pc))
+		return -EFAULT;
+
+	if ((insn & VIS_OPCODE_MASK) != VIS_OPCODE_VAL)
+		return -EINVAL;
+
+	opf = (insn & VIS_OPF_MASK) >> VIS_OPF_SHIFT;
+	switch (opf) {
+	default:
+		return -EINVAL;
+
+	/* Pixel Formatting Instructions.  */
+	case FPACK16_OPF:
+	case FPACK32_OPF:
+	case FPACKFIX_OPF:
+	case FEXPAND_OPF:
+	case FPMERGE_OPF:
+		pformat(regs, insn, opf);
+		break;
+
+	/* Partitioned Multiply Instructions  */
+	case FMUL8x16_OPF:
+	case FMUL8x16AU_OPF:
+	case FMUL8x16AL_OPF:
+	case FMUL8SUx16_OPF:
+	case FMUL8ULx16_OPF:
+	case FMULD8SUx16_OPF:
+	case FMULD8ULx16_OPF:
+		pmul(regs, insn, opf);
+		break;
+
+	/* Pixel Compare Instructions  */
+	case FCMPGT16_OPF:
+	case FCMPGT32_OPF:
+	case FCMPLE16_OPF:
+	case FCMPLE32_OPF:
+	case FCMPNE16_OPF:
+	case FCMPNE32_OPF:
+	case FCMPEQ16_OPF:
+	case FCMPEQ32_OPF:
+		pcmp(regs, insn, opf);
+		break;
+
+	/* Edge Handling Instructions  */
+	case EDGE8_OPF:
+	case EDGE8N_OPF:
+	case EDGE8L_OPF:
+	case EDGE8LN_OPF:
+	case EDGE16_OPF:
+	case EDGE16N_OPF:
+	case EDGE16L_OPF:
+	case EDGE16LN_OPF:
+	case EDGE32_OPF:
+	case EDGE32N_OPF:
+	case EDGE32L_OPF:
+	case EDGE32LN_OPF:
+		edge(regs, insn, opf);
+		break;
+
+	/* Pixel Component Distance  */
+	case PDIST_OPF:
+		pdist(regs, insn);
+		break;
+
+	/* Three-Dimensional Array Addressing Instructions  */
+	case ARRAY8_OPF:
+	case ARRAY16_OPF:
+	case ARRAY32_OPF:
+		array(regs, insn, opf);
+		break;
+
+	/* Byte Mask and Shuffle Instructions  */
+	case BMASK_OPF:
+		bmask(regs, insn);
+		break;
+
+	case BSHUFFLE_OPF:
+		bshuffle(regs, insn);
+		break;
+	};
+
+	regs->tpc = regs->tnpc;
+	regs->tnpc += 4;
+	return 0;
+}

+ 16 - 0
arch/sparc64/kernel/vmlinux.lds.S

@@ -70,6 +70,22 @@ SECTIONS
   .con_initcall.init : { *(.con_initcall.init) }
   __con_initcall_end = .;
   SECURITY_INIT
+  . = ALIGN(4);
+  __tsb_ldquad_phys_patch = .;
+  .tsb_ldquad_phys_patch : { *(.tsb_ldquad_phys_patch) }
+  __tsb_ldquad_phys_patch_end = .;
+  __tsb_phys_patch = .;
+  .tsb_phys_patch : { *(.tsb_phys_patch) }
+  __tsb_phys_patch_end = .;
+  __cpuid_patch = .;
+  .cpuid_patch : { *(.cpuid_patch) }
+  __cpuid_patch_end = .;
+  __sun4v_1insn_patch = .;
+  .sun4v_1insn_patch : { *(.sun4v_1insn_patch) }
+  __sun4v_1insn_patch_end = .;
+  __sun4v_2insn_patch = .;
+  .sun4v_2insn_patch : { *(.sun4v_2insn_patch) }
+  __sun4v_2insn_patch_end = .;
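+  /* Each of these patch sections is an array of small fixed-size
+   * records (the address of an instruction to patch plus its
+   * replacement words; the exact layouts live in the corresponding C
+   * headers) that boot code walks once the cpu/tlb type is known,
+   * hence the 4-byte alignment above.
+   */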
   . = ALIGN(8192);
   __initramfs_start = .;
   .init.ramfs : { *(.init.ramfs) }

+ 124 - 356
arch/sparc64/kernel/winfixup.S

@@ -1,8 +1,6 @@
-/* $Id: winfixup.S,v 1.30 2002/02/09 19:49:30 davem Exp $
+/* winfixup.S: Handle cases where user stack pointer is found to be bogus.
  *
- * winfixup.S: Handle cases where user stack pointer is found to be bogus.
- *
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1997, 2006 David S. Miller (davem@davemloft.net)
  */
 
 #include <asm/asi.h>
@@ -15,374 +13,144 @@
 
 	.text
 
-set_pcontext:
-	sethi	%hi(sparc64_kern_pri_context), %l1
-	ldx	[%l1 + %lo(sparc64_kern_pri_context)], %l1
-	mov	PRIMARY_CONTEXT, %g1
-	stxa	%l1, [%g1] ASI_DMMU
-	flush	%g6
-	retl
-	 nop
+	/* It used to be the case that these register window fault
+	 * handlers could run via the save and restore instructions
+	 * done by the trap entry and exit code.  They now do the
+	 * window spill/fill by hand, so that case no longer can occur.
+	 */
 
 	.align	32
-
-	/* Here are the rules, pay attention.
-	 *
-	 * The kernel is disallowed from touching user space while
-	 * the trap level is greater than zero, except for from within
-	 * the window spill/fill handlers.  This must be followed
-	 * so that we can easily detect the case where we tried to
-	 * spill/fill with a bogus (or unmapped) user stack pointer.
-	 *
-	 * These are layed out in a special way for cache reasons,
-	 * don't touch...
-	 */
-	.globl	fill_fixup, spill_fixup
 fill_fixup:
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
-	or		%g4, FAULT_CODE_WINFIXUP, %g4
-	be,pt		%xcc, window_scheisse_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-
-	/* This is the extremely complex case, but it does happen from
-	 * time to time if things are just right.  Essentially the restore
-	 * done in rtrap right before going back to user mode, with tl=1
-	 * and that levels trap stack registers all setup, took a fill trap,
-	 * the user stack was not mapped in the tlb, and tlb miss occurred,
-	 * the pte found was not valid, and a simple ref bit watch update
-	 * could not satisfy the miss, so we got here.
-	 *
-	 * We must carefully unwind the state so we get back to tl=0, preserve
-	 * all the register values we were going to give to the user.  Luckily
-	 * most things are where they need to be, we also have the address
-	 * which triggered the fault handy as well.
-	 *
-	 * Also note that we must preserve %l5 and %l6.  If the user was
-	 * returning from a system call, we must make it look this way
-	 * after we process the fill fault on the users stack.
-	 *
-	 * First, get into the window where the original restore was executed.
-	 */
-
-	rdpr		%wstate, %g2			! Grab user mode wstate.
-	wrpr		%g1, %cwp			! Get into the right window.
-	sll		%g2, 3, %g2			! NORMAL-->OTHER
-
-	wrpr		%g0, 0x0, %canrestore		! Standard etrap stuff.
-	wrpr		%g2, 0x0, %wstate		! This must be consistent.
-	wrpr		%g0, 0x0, %otherwin		! We know this.
-	call		set_pcontext			! Change contexts...
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	rdpr	%tstate, %g1
+	and	%g1, TSTATE_CWP, %g1
+	or	%g4, FAULT_CODE_WINFIXUP, %g4
+	stb	%g4, [%g6 + TI_FAULT_CODE]
+	stx	%g5, [%g6 + TI_FAULT_ADDR]
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	call	do_sparc64_fault
+	 add	%sp, PTREGS_OFF, %o0
+	ba,pt	%xcc, rtrap_clr_l6
 	 nop
-	rdpr		%pstate, %l1			! Prepare to change globals.
-	mov		%g6, %o7			! Get current.
-
-	andn		%l1, PSTATE_MM, %l1		! We want to be in RMO
-	stb		%g4, [%g6 + TI_FAULT_CODE]
-	stx		%g5, [%g6 + TI_FAULT_ADDR]
-	wrpr		%g0, 0x0, %tl			! Out of trap levels.
-	wrpr		%l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
-	mov		%o7, %g6
-	ldx		[%g6 + TI_TASK], %g4
-#ifdef CONFIG_SMP
-	mov		TSB_REG, %g1
-	ldxa		[%g1] ASI_IMMU, %g5
-#endif
 
-	/* This is the same as below, except we handle this a bit special
-	 * since we must preserve %l5 and %l6, see comment above.
-	 */
-	call		do_sparc64_fault
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 nop						! yes, nop is correct
-
-	/* Be very careful about usage of the alternate globals here.
-	 * You cannot touch %g4/%g5 as that has the fault information
-	 * should this be from usermode.  Also be careful for the case
-	 * where we get here from the save instruction in etrap.S when
-	 * coming from either user or kernel (does not matter which, it
-	 * is the same problem in both cases).  Essentially this means
-	 * do not touch %g7 or %g2 so we handle the two cases fine.
+	/* Be very careful about usage of the trap globals here.
+	 * You cannot touch %g5 as that has the fault information.
 	 */
 spill_fixup:
-	ldx		[%g6 + TI_FLAGS], %g1
-	andcc		%g1, _TIF_32BIT, %g0
-	ldub		[%g6 + TI_WSAVED], %g1
-
-	sll		%g1, 3, %g3
-	add		%g6, %g3, %g3
-	stx		%sp, [%g3 + TI_RWIN_SPTRS]
-	sll		%g1, 7, %g3
-	bne,pt		%xcc, 1f
-	 add		%g6, %g3, %g3
-	stx		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	stx		%l1, [%g3 + TI_REG_WINDOW + 0x08]
-
-	stx		%l2, [%g3 + TI_REG_WINDOW + 0x10]
-	stx		%l3, [%g3 + TI_REG_WINDOW + 0x18]
-	stx		%l4, [%g3 + TI_REG_WINDOW + 0x20]
-	stx		%l5, [%g3 + TI_REG_WINDOW + 0x28]
-	stx		%l6, [%g3 + TI_REG_WINDOW + 0x30]
-	stx		%l7, [%g3 + TI_REG_WINDOW + 0x38]
-	stx		%i0, [%g3 + TI_REG_WINDOW + 0x40]
-	stx		%i1, [%g3 + TI_REG_WINDOW + 0x48]
-
-	stx		%i2, [%g3 + TI_REG_WINDOW + 0x50]
-	stx		%i3, [%g3 + TI_REG_WINDOW + 0x58]
-	stx		%i4, [%g3 + TI_REG_WINDOW + 0x60]
-	stx		%i5, [%g3 + TI_REG_WINDOW + 0x68]
-	stx		%i6, [%g3 + TI_REG_WINDOW + 0x70]
-	b,pt		%xcc, 2f
-	 stx		%i7, [%g3 + TI_REG_WINDOW + 0x78]
-1:	stw		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-
-	stw		%l1, [%g3 + TI_REG_WINDOW + 0x04]
-	stw		%l2, [%g3 + TI_REG_WINDOW + 0x08]
-	stw		%l3, [%g3 + TI_REG_WINDOW + 0x0c]
-	stw		%l4, [%g3 + TI_REG_WINDOW + 0x10]
-	stw		%l5, [%g3 + TI_REG_WINDOW + 0x14]
-	stw		%l6, [%g3 + TI_REG_WINDOW + 0x18]
-	stw		%l7, [%g3 + TI_REG_WINDOW + 0x1c]
-	stw		%i0, [%g3 + TI_REG_WINDOW + 0x20]
-
-	stw		%i1, [%g3 + TI_REG_WINDOW + 0x24]
-	stw		%i2, [%g3 + TI_REG_WINDOW + 0x28]
-	stw		%i3, [%g3 + TI_REG_WINDOW + 0x2c]
-	stw		%i4, [%g3 + TI_REG_WINDOW + 0x30]
-	stw		%i5, [%g3 + TI_REG_WINDOW + 0x34]
-	stw		%i6, [%g3 + TI_REG_WINDOW + 0x38]
-	stw		%i7, [%g3 + TI_REG_WINDOW + 0x3c]
-2:	add		%g1, 1, %g1
-
-	stb		%g1, [%g6 + TI_WSAVED]
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
+spill_fixup_mna:
+spill_fixup_dax:
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	ldx	[%g6 + TI_FLAGS], %g1
+	andcc	%g1, _TIF_32BIT, %g0
+	ldub	[%g6 + TI_WSAVED], %g1
+	sll	%g1, 3, %g3
+	add	%g6, %g3, %g3
+	stx	%sp, [%g3 + TI_RWIN_SPTRS]
+	sll	%g1, 7, %g3
+	bne,pt	%xcc, 1f
+	 add	%g6, %g3, %g3
+	stx	%l0, [%g3 + TI_REG_WINDOW + 0x00]
+	stx	%l1, [%g3 + TI_REG_WINDOW + 0x08]
+	stx	%l2, [%g3 + TI_REG_WINDOW + 0x10]
+	stx	%l3, [%g3 + TI_REG_WINDOW + 0x18]
+	stx	%l4, [%g3 + TI_REG_WINDOW + 0x20]
+	stx	%l5, [%g3 + TI_REG_WINDOW + 0x28]
+	stx	%l6, [%g3 + TI_REG_WINDOW + 0x30]
+	stx	%l7, [%g3 + TI_REG_WINDOW + 0x38]
+	stx	%i0, [%g3 + TI_REG_WINDOW + 0x40]
+	stx	%i1, [%g3 + TI_REG_WINDOW + 0x48]
+	stx	%i2, [%g3 + TI_REG_WINDOW + 0x50]
+	stx	%i3, [%g3 + TI_REG_WINDOW + 0x58]
+	stx	%i4, [%g3 + TI_REG_WINDOW + 0x60]
+	stx	%i5, [%g3 + TI_REG_WINDOW + 0x68]
+	stx	%i6, [%g3 + TI_REG_WINDOW + 0x70]
+	ba,pt	%xcc, 2f
+	 stx	%i7, [%g3 + TI_REG_WINDOW + 0x78]
+1:	stw	%l0, [%g3 + TI_REG_WINDOW + 0x00]
+	stw	%l1, [%g3 + TI_REG_WINDOW + 0x04]
+	stw	%l2, [%g3 + TI_REG_WINDOW + 0x08]
+	stw	%l3, [%g3 + TI_REG_WINDOW + 0x0c]
+	stw	%l4, [%g3 + TI_REG_WINDOW + 0x10]
+	stw	%l5, [%g3 + TI_REG_WINDOW + 0x14]
+	stw	%l6, [%g3 + TI_REG_WINDOW + 0x18]
+	stw	%l7, [%g3 + TI_REG_WINDOW + 0x1c]
+	stw	%i0, [%g3 + TI_REG_WINDOW + 0x20]
+	stw	%i1, [%g3 + TI_REG_WINDOW + 0x24]
+	stw	%i2, [%g3 + TI_REG_WINDOW + 0x28]
+	stw	%i3, [%g3 + TI_REG_WINDOW + 0x2c]
+	stw	%i4, [%g3 + TI_REG_WINDOW + 0x30]
+	stw	%i5, [%g3 + TI_REG_WINDOW + 0x34]
+	stw	%i6, [%g3 + TI_REG_WINDOW + 0x38]
+	stw	%i7, [%g3 + TI_REG_WINDOW + 0x3c]
+2:	add	%g1, 1, %g1
+	stb	%g1, [%g6 + TI_WSAVED]
+	rdpr	%tstate, %g1
+	andcc	%g1, TSTATE_PRIV, %g0
 	saved
-	and		%g1, TSTATE_CWP, %g1
-	be,pn		%xcc, window_scheisse_from_user_common
-	 mov		FAULT_CODE_WRITE | FAULT_CODE_DTLB | FAULT_CODE_WINFIXUP, %g4
+	be,pn	%xcc, 1f
+	 and	%g1, TSTATE_CWP, %g1
 	retry
+1:	mov	FAULT_CODE_WRITE | FAULT_CODE_DTLB | FAULT_CODE_WINFIXUP, %g4
+	stb	%g4, [%g6 + TI_FAULT_CODE]
+	stx	%g5, [%g6 + TI_FAULT_ADDR]
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	call	do_sparc64_fault
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
 
 
-window_scheisse_from_user_common:
-	stb		%g4, [%g6 + TI_FAULT_CODE]
-	stx		%g5, [%g6 + TI_FAULT_ADDR]
-	wrpr		%g1, %cwp
-	ba,pt		%xcc, etrap
-	 rd		%pc, %g7
-	call		do_sparc64_fault
-	 add		%sp, PTREGS_OFF, %o0
-	ba,a,pt		%xcc, rtrap_clr_l6
-
-	.globl		winfix_mna, fill_fixup_mna, spill_fixup_mna
 winfix_mna:
-	andn		%g3, 0x7f, %g3
-	add		%g3, 0x78, %g3
-	wrpr		%g3, %tnpc
+	andn	%g3, 0x7f, %g3
+	add	%g3, 0x78, %g3
+	wrpr	%g3, %tnpc
 	done
-fill_fixup_mna:
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
-	be,pt		%xcc, window_mna_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
 
-	/* Please, see fill_fixup commentary about why we must preserve
-	 * %l5 and %l6 to preserve absolute correct semantics.
-	 */
-	rdpr		%wstate, %g2			! Grab user mode wstate.
-	wrpr		%g1, %cwp			! Get into the right window.
-	sll		%g2, 3, %g2			! NORMAL-->OTHER
-	wrpr		%g0, 0x0, %canrestore		! Standard etrap stuff.
-
-	wrpr		%g2, 0x0, %wstate		! This must be consistent.
-	wrpr		%g0, 0x0, %otherwin		! We know this.
-	call		set_pcontext			! Change contexts...
+fill_fixup_mna:
+	rdpr	%tstate, %g1
+	and	%g1, TSTATE_CWP, %g1
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	sethi	%hi(tlb_type), %g1
+	lduw	[%g1 + %lo(tlb_type)], %g1
+	cmp	%g1, 3
+	bne,pt	%icc, 1f
+	 add	%sp, PTREGS_OFF, %o0
+	mov	%l4, %o2
+	call	sun4v_do_mna
+	 mov	%l5, %o1
+	ba,a,pt	%xcc, rtrap_clr_l6
+1:	mov	%l4, %o1
+	mov	%l5, %o2
+	call	mem_address_unaligned
 	 nop
-	rdpr		%pstate, %l1			! Prepare to change globals.
-	mov		%g4, %o2			! Setup args for
-	mov		%g5, %o1			! final call to mem_address_unaligned.
-	andn		%l1, PSTATE_MM, %l1		! We want to be in RMO
+	ba,a,pt	%xcc, rtrap_clr_l6
 
-	mov		%g6, %o7			! Stash away current.
-	wrpr		%g0, 0x0, %tl			! Out of trap levels.
-	wrpr		%l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
-	mov		%o7, %g6			! Get current back.
-	ldx		[%g6 + TI_TASK], %g4		! Finish it.
-#ifdef CONFIG_SMP
-	mov		TSB_REG, %g1
-	ldxa		[%g1] ASI_IMMU, %g5
-#endif
-	call		mem_address_unaligned
-	 add		%sp, PTREGS_OFF, %o0
-
-	b,pt		%xcc, rtrap
-	 nop						! yes, the nop is correct
-spill_fixup_mna:
-	ldx		[%g6 + TI_FLAGS], %g1
-	andcc		%g1, _TIF_32BIT, %g0
-	ldub		[%g6 + TI_WSAVED], %g1
-	sll		%g1, 3, %g3
-	add		%g6, %g3, %g3
-	stx		%sp, [%g3 + TI_RWIN_SPTRS]
-
-	sll		%g1, 7, %g3
-	bne,pt		%xcc, 1f
-	 add		%g6, %g3, %g3
-	stx		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	stx		%l1, [%g3 + TI_REG_WINDOW + 0x08]
-	stx		%l2, [%g3 + TI_REG_WINDOW + 0x10]
-	stx		%l3, [%g3 + TI_REG_WINDOW + 0x18]
-	stx		%l4, [%g3 + TI_REG_WINDOW + 0x20]
-
-	stx		%l5, [%g3 + TI_REG_WINDOW + 0x28]
-	stx		%l6, [%g3 + TI_REG_WINDOW + 0x30]
-	stx		%l7, [%g3 + TI_REG_WINDOW + 0x38]
-	stx		%i0, [%g3 + TI_REG_WINDOW + 0x40]
-	stx		%i1, [%g3 + TI_REG_WINDOW + 0x48]
-	stx		%i2, [%g3 + TI_REG_WINDOW + 0x50]
-	stx		%i3, [%g3 + TI_REG_WINDOW + 0x58]
-	stx		%i4, [%g3 + TI_REG_WINDOW + 0x60]
-
-	stx		%i5, [%g3 + TI_REG_WINDOW + 0x68]
-	stx		%i6, [%g3 + TI_REG_WINDOW + 0x70]
-	stx		%i7, [%g3 + TI_REG_WINDOW + 0x78]
-	b,pt		%xcc, 2f
-	 add		%g1, 1, %g1
-1:	std		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	std		%l2, [%g3 + TI_REG_WINDOW + 0x08]
-	std		%l4, [%g3 + TI_REG_WINDOW + 0x10]
-
-	std		%l6, [%g3 + TI_REG_WINDOW + 0x18]
-	std		%i0, [%g3 + TI_REG_WINDOW + 0x20]
-	std		%i2, [%g3 + TI_REG_WINDOW + 0x28]
-	std		%i4, [%g3 + TI_REG_WINDOW + 0x30]
-	std		%i6, [%g3 + TI_REG_WINDOW + 0x38]
-	add		%g1, 1, %g1
-2:	stb		%g1, [%g6 + TI_WSAVED]
-	rdpr		%tstate, %g1
-
-	andcc		%g1, TSTATE_PRIV, %g0
-	saved
-	be,pn		%xcc, window_mna_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-	retry
-window_mna_from_user_common:
-	wrpr		%g1, %cwp
-	sethi		%hi(109f), %g7
-	ba,pt		%xcc, etrap
-109:	 or		%g7, %lo(109b), %g7
-	mov		%l4, %o2
-	mov		%l5, %o1
-	call		mem_address_unaligned
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 clr		%l6
-	
-	/* These are only needed for 64-bit mode processes which
-	 * put their stack pointer into the VPTE area and there
-	 * happens to be a VPTE tlb entry mapped there during
-	 * a spill/fill trap to that stack frame.
-	 */
-	.globl		winfix_dax, fill_fixup_dax, spill_fixup_dax
 winfix_dax:
-	andn		%g3, 0x7f, %g3
-	add		%g3, 0x74, %g3
-	wrpr		%g3, %tnpc
+	andn	%g3, 0x7f, %g3
+	add	%g3, 0x74, %g3
+	wrpr	%g3, %tnpc
 	done
-fill_fixup_dax:
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
-	be,pt		%xcc, window_dax_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-
-	/* Please, see fill_fixup commentary about why we must preserve
-	 * %l5 and %l6 to preserve absolute correct semantics.
-	 */
-	rdpr		%wstate, %g2			! Grab user mode wstate.
-	wrpr		%g1, %cwp			! Get into the right window.
-	sll		%g2, 3, %g2			! NORMAL-->OTHER
-	wrpr		%g0, 0x0, %canrestore		! Standard etrap stuff.
 
-	wrpr		%g2, 0x0, %wstate		! This must be consistent.
-	wrpr		%g0, 0x0, %otherwin		! We know this.
-	call		set_pcontext			! Change contexts...
+fill_fixup_dax:
+	rdpr	%tstate, %g1
+	and	%g1, TSTATE_CWP, %g1
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	sethi	%hi(tlb_type), %g1
+	mov	%l4, %o1
+	lduw	[%g1 + %lo(tlb_type)], %g1
+	mov	%l5, %o2
+	cmp	%g1, 3
+	bne,pt	%icc, 1f
+	 add	%sp, PTREGS_OFF, %o0
+	call	sun4v_data_access_exception
 	 nop
-	rdpr		%pstate, %l1			! Prepare to change globals.
-	mov		%g4, %o1			! Setup args for
-	mov		%g5, %o2			! final call to spitfire_data_access_exception.
-	andn		%l1, PSTATE_MM, %l1		! We want to be in RMO
-
-	mov		%g6, %o7			! Stash away current.
-	wrpr		%g0, 0x0, %tl			! Out of trap levels.
-	wrpr		%l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
-	mov		%o7, %g6			! Get current back.
-	ldx		[%g6 + TI_TASK], %g4		! Finish it.
-#ifdef CONFIG_SMP
-	mov		TSB_REG, %g1
-	ldxa		[%g1] ASI_IMMU, %g5
-#endif
-	call		spitfire_data_access_exception
-	 add		%sp, PTREGS_OFF, %o0
-
-	b,pt		%xcc, rtrap
-	 nop						! yes, the nop is correct
-spill_fixup_dax:
-	ldx		[%g6 + TI_FLAGS], %g1
-	andcc		%g1, _TIF_32BIT, %g0
-	ldub		[%g6 + TI_WSAVED], %g1
-	sll		%g1, 3, %g3
-	add		%g6, %g3, %g3
-	stx		%sp, [%g3 + TI_RWIN_SPTRS]
-
-	sll		%g1, 7, %g3
-	bne,pt		%xcc, 1f
-	 add		%g6, %g3, %g3
-	stx		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	stx		%l1, [%g3 + TI_REG_WINDOW + 0x08]
-	stx		%l2, [%g3 + TI_REG_WINDOW + 0x10]
-	stx		%l3, [%g3 + TI_REG_WINDOW + 0x18]
-	stx		%l4, [%g3 + TI_REG_WINDOW + 0x20]
-
-	stx		%l5, [%g3 + TI_REG_WINDOW + 0x28]
-	stx		%l6, [%g3 + TI_REG_WINDOW + 0x30]
-	stx		%l7, [%g3 + TI_REG_WINDOW + 0x38]
-	stx		%i0, [%g3 + TI_REG_WINDOW + 0x40]
-	stx		%i1, [%g3 + TI_REG_WINDOW + 0x48]
-	stx		%i2, [%g3 + TI_REG_WINDOW + 0x50]
-	stx		%i3, [%g3 + TI_REG_WINDOW + 0x58]
-	stx		%i4, [%g3 + TI_REG_WINDOW + 0x60]
-
-	stx		%i5, [%g3 + TI_REG_WINDOW + 0x68]
-	stx		%i6, [%g3 + TI_REG_WINDOW + 0x70]
-	stx		%i7, [%g3 + TI_REG_WINDOW + 0x78]
-	b,pt		%xcc, 2f
-	 add		%g1, 1, %g1
-1:	std		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	std		%l2, [%g3 + TI_REG_WINDOW + 0x08]
-	std		%l4, [%g3 + TI_REG_WINDOW + 0x10]
-
-	std		%l6, [%g3 + TI_REG_WINDOW + 0x18]
-	std		%i0, [%g3 + TI_REG_WINDOW + 0x20]
-	std		%i2, [%g3 + TI_REG_WINDOW + 0x28]
-	std		%i4, [%g3 + TI_REG_WINDOW + 0x30]
-	std		%i6, [%g3 + TI_REG_WINDOW + 0x38]
-	add		%g1, 1, %g1
-2:	stb		%g1, [%g6 + TI_WSAVED]
-	rdpr		%tstate, %g1
-
-	andcc		%g1, TSTATE_PRIV, %g0
-	saved
-	be,pn		%xcc, window_dax_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-	retry
-window_dax_from_user_common:
-	wrpr		%g1, %cwp
-	sethi		%hi(109f), %g7
-	ba,pt		%xcc, etrap
-109:	 or		%g7, %lo(109b), %g7
-	mov		%l4, %o1
-	mov		%l5, %o2
-	call		spitfire_data_access_exception
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 clr		%l6
+	ba,a,pt	%xcc, rtrap_clr_l6
+1:	call	spitfire_data_access_exception
+	 nop
+	ba,a,pt	%xcc, rtrap_clr_l6

+ 2 - 0
arch/sparc64/lib/Makefile

@@ -11,6 +11,8 @@ lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
 	 VISsave.o atomic.o bitops.o \
 	 U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
 	 U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
+	 NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o NGpatch.o \
+	 NGpage.o NGbzero.o \
 	 copy_in_user.o user_fixup.o memmove.o \
 	 mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o
 

+ 163 - 0
arch/sparc64/lib/NGbzero.S

@@ -0,0 +1,163 @@
+/* NGbzero.S: Niagara optimized memset/clear_user.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+#include <asm/asi.h>
+
+#define EX_ST(x,y)		\
+98:	x,y;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	retl;			\
+	 mov	%o1, %o0;	\
+	.section __ex_table;	\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
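+
+/* A store wrapped in EX_ST() that faults resolves through the
+ * __ex_table entry to label 99, which returns the residual byte count
+ * (%o1) as the function's return value, matching what clear_user()
+ * callers expect.
+ */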
+
+	.text
+
+	.globl		NGmemset
+	.type		NGmemset, #function
+NGmemset:		/* %o0=buf, %o1=pat, %o2=len */
+	and		%o1, 0xff, %o3
+	mov		%o2, %o1
+	sllx		%o3, 8, %g1
+	or		%g1, %o3, %o2
+	sllx		%o2, 16, %g1
+	or		%g1, %o2, %o2
+	sllx		%o2, 32, %g1
+	ba,pt		%xcc, 1f
+	 or		%g1, %o2, %o2
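+
+	/* %o2 now holds the fill byte replicated into all eight byte
+	 * lanes, i.e. pat * 0x0101010101010101UL in C, with the length
+	 * in %o1.
+	 */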
+
+	.globl		NGbzero
+	.type		NGbzero, #function
+NGbzero:
+	clr		%o2
+1:	brz,pn		%o1, NGbzero_return
+	 mov		%o0, %o3
+
+	/* %o5: saved %asi, restored at NGbzero_done
+	 * %g7: store-init %asi to use
+	 * %o4:	non-store-init %asi to use
+	 */
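+	/* The BLK_INIT ASIs let Niagara allocate each cache line without
+	 * first reading its old contents from memory, which is the win
+	 * on long, 64-byte-aligned runs.
+	 */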
+	rd		%asi, %o5
+	mov		ASI_BLK_INIT_QUAD_LDD_P, %g7
+	mov		ASI_P, %o4
+	wr		%o4, 0x0, %asi
+
+NGbzero_from_clear_user:
+	cmp		%o1, 15
+	bl,pn		%icc, NGbzero_tiny
+	 andcc		%o0, 0x7, %g1
+	be,pt		%xcc, 2f
+	 mov		8, %g2
+	sub		%g2, %g1, %g1
+	sub		%o1, %g1, %o1
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%o0, 1, %o0
+2:	cmp		%o1, 128
+	bl,pn		%icc, NGbzero_medium
+	 andcc		%o0, (64 - 1), %g1
+	be,pt		%xcc, NGbzero_pre_loop
+	 mov		64, %g2
+	sub		%g2, %g1, %g1
+	sub		%o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc		%g1, 8, %g1
+	bne,pt		%xcc, 1b
+	 add		%o0, 8, %o0
+
+NGbzero_pre_loop:
+	wr		%g7, 0x0, %asi
+	andn		%o1, (64 - 1), %g1
+	sub		%o1, %g1, %o1
+NGbzero_loop:
+	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x08] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x10] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x18] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x20] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x28] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x30] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x38] %asi)
+	subcc		%g1, 64, %g1
+	bne,pt		%xcc, NGbzero_loop
+	 add		%o0, 64, %o0
+
+	wr		%o4, 0x0, %asi
+	brz,pn		%o1, NGbzero_done
+NGbzero_medium:
+	 andncc		%o1, 0x7, %g1
+	be,pn		%xcc, 2f
+	 sub		%o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc		%g1, 8, %g1
+	bne,pt		%xcc, 1b
+	 add		%o0, 8, %o0
+2:	brz,pt		%o1, NGbzero_done
+	 nop
+
+NGbzero_tiny:
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc		%o1, 1, %o1
+	bne,pt		%icc, 1b
+	 add		%o0, 1, %o0
+
+	/* fallthrough */
+
+NGbzero_done:
+	wr		%o5, 0x0, %asi
+
+NGbzero_return:
+	retl
+	 mov		%o3, %o0
+	.size		NGbzero, .-NGbzero
+	.size		NGmemset, .-NGmemset
+
+	.globl		NGclear_user
+	.type		NGclear_user, #function
+NGclear_user:		/* %o0=buf, %o1=len */
+	rd		%asi, %o5
+	brz,pn		%o1, NGbzero_done
+	 clr		%o3
+	cmp		%o5, ASI_AIUS
+	bne,pn		%icc, NGbzero
+	 clr		%o2
+	mov		ASI_BLK_INIT_QUAD_LDD_AIUS, %g7
+	ba,pt		%xcc, NGbzero_from_clear_user
+	 mov		ASI_AIUS, %o4
+	.size		NGclear_user, .-NGclear_user
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara_patch_bzero
+	.type	niagara_patch_bzero,#function
+niagara_patch_bzero:
+	NG_DO_PATCH(memset, NGmemset)
+	NG_DO_PATCH(__bzero, NGbzero)
+	NG_DO_PATCH(__clear_user, NGclear_user)
+	NG_DO_PATCH(tsb_init, NGtsb_init)
+	retl
+	 nop
+	.size	niagara_patch_bzero,.-niagara_patch_bzero

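(Editor's note: the NG_DO_PATCH macro above rewrites the first two instructions of the generic routine with a "ba,a,pt %xcc" into the Niagara version plus a nop. As an illustrative sketch, not part of the commit — the function name is invented — the instruction word it stores can be modeled in C like this; BRANCH_ALWAYS is the branch opcode template whose low 19 bits hold a signed word displacement, and the sll 11 / srl (11 + 2) pair truncates a byte offset into that field:)

	#include <stdint.h>

	#define BRANCH_ALWAYS	0x10680000u
	#define NOP		0x01000000u

	/* Sketch: build the "branch always" word NG_DO_PATCH stores at
	 * the patch site.  srl operates on the low 32 bits, so the
	 * sll 11 / srl 13 sequence keeps exactly the 19-bit word
	 * displacement field of the branch.
	 */
	static uint32_t branch_insn(uint32_t *site, uint32_t *target)
	{
		int64_t byte_off = (char *)target - (char *)site;
		uint32_t disp19 = ((uint32_t)(byte_off << 11)) >> (11 + 2);

		return BRANCH_ALWAYS | disp19;
	}
	/* site[0] = branch_insn(site, target); site[1] = NOP; then flush. */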
+ 37 - 0
arch/sparc64/lib/NGcopy_from_user.S

@@ -0,0 +1,37 @@
+/* NGcopy_from_user.S: Niagara optimized copy from userspace.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	wr	%g0, ASI_AIUS, %asi;\
+	retl;			\
+	 mov	1, %o0;		\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#define FUNC_NAME		NGcopy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] ASI_AIUS, dest
+#define LOAD_TWIN(addr_reg,dest0,dest1)	\
+	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_AIUS, dest0
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop
+#endif
+
+#include "NGmemcpy.S"

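(Editor's note: the EX_LD wrapper pairs every potentially faulting userspace load with a fixup; the ".word 98b, 99b" entries populate the __ex_table section. A rough model of how the fault path consumes such entries — an illustrative sketch only, with assumed names and field widths; the real lookup is the kernel's exception-table search:)

	struct extable_entry {
		unsigned int insn;	/* PC of the faulting instruction (98:) */
		unsigned int fixup;	/* PC to resume at (99:) */
	};

	/* On a kernel-mode fault, scan the table; if the trapping PC has
	 * a registered fixup, resume there instead of oopsing.  For the
	 * copy routines the fixup restores %asi and returns an error.
	 */
	static unsigned long lookup_fixup(const struct extable_entry *tbl,
					  unsigned long n, unsigned long pc)
	{
		unsigned long i;

		for (i = 0; i < n; i++)
			if (tbl[i].insn == pc)
				return tbl[i].fixup;
		return 0;	/* no entry: genuine kernel fault */
	}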
+ 40 - 0
arch/sparc64/lib/NGcopy_to_user.S

@@ -0,0 +1,40 @@
+/* NGcopy_to_user.S: Niagara optimized copy to userspace.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	wr	%g0, ASI_AIUS, %asi;\
+	retl;			\
+	 mov	1, %o0;		\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#define FUNC_NAME		NGcopy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] ASI_AIUS
+#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+	/* Writing to %asi is _expensive_ so we hardcode it.
+	 * Reading %asi to check for KERNEL_DS is comparatively
+	 * cheap.
+	 */
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop
+#endif
+
+#include "NGmemcpy.S"

+ 368 - 0
arch/sparc64/lib/NGmemcpy.S

@@ -0,0 +1,368 @@
+/* NGmemcpy.S: Niagara optimized memcpy.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#include <asm/asi.h>
+#include <asm/thread_info.h>
+#define GLOBAL_SPARE	%g7
+#define RESTORE_ASI(TMP)	\
+	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
+	wr	TMP, 0x0, %asi;
+#else
+#define GLOBAL_SPARE	%g5
+#define RESTORE_ASI(TMP)	\
+	wr	%g0, ASI_PNF, %asi
+#endif
+
+#ifndef STORE_ASI
+#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x)	x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x)	x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+#endif
+
+#ifndef LOAD
+#ifndef MEMCPY_DEBUG
+#define LOAD(type,addr,dest)	type [addr], dest
+#else
+#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
+#endif
+#endif
+
+#ifndef LOAD_TWIN
+#define LOAD_TWIN(addr_reg,dest0,dest1)	\
+	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
+#endif
+
+#ifndef STORE
+#define STORE(type,src,addr)	type src, [addr]
+#endif
+
+#ifndef STORE_INIT
+#define STORE_INIT(src,addr)	stxa src, [addr] %asi
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME	NGmemcpy
+#endif
+
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+	.register	%g2,#scratch
+	.register	%g3,#scratch
+
+	.text
+	.align		64
+
+	.globl	FUNC_NAME
+	.type	FUNC_NAME,#function
+FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
+	srlx		%o2, 31, %g2
+	cmp		%g2, 0
+	tne		%xcc, 5
+	PREAMBLE
+	mov		%o0, GLOBAL_SPARE
+	cmp		%o2, 0
+	be,pn		%XCC, 85f
+	 or		%o0, %o1, %o3
+	cmp		%o2, 16
+	blu,a,pn	%XCC, 80f
+	 or		%o3, %o2, %o3
+
+	/* 2 blocks (128 bytes) is the minimum we can do the block
+	 * copy with.  We need to ensure that we'll iterate at least
+	 * once in the block copy loop.  At worst we'll need to align
+	 * the destination to a 64-byte boundary which can chew up
+	 * to (64 - 1) bytes from the length before we perform the
+	 * block copy loop.
+	 */
+	cmp		%o2, (2 * 64)
+	blu,pt		%XCC, 70f
+	 andcc		%o3, 0x7, %g0
+
+	/* %o0:	dst
+	 * %o1:	src
+	 * %o2:	len  (known to be >= 128)
+	 *
+	 * The block copy loops will use %o4/%o5,%g2/%g3 as
+	 * temporaries while copying the data.
+	 */
+
+	LOAD(prefetch, %o1, #one_read)
+	wr		%g0, STORE_ASI, %asi
+
+	/* Align destination on 64-byte boundary.  */
+	andcc		%o0, (64 - 1), %o4
+	be,pt		%XCC, 2f
+	 sub		%o4, 64, %o4
+	sub		%g0, %o4, %o4	! bytes to align dst
+	sub		%o2, %o4, %o2
+1:	subcc		%o4, 1, %o4
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o0))
+	add		%o1, 1, %o1
+	bne,pt		%XCC, 1b
+	add		%o0, 1, %o0
+
+	/* If the source is on a 16-byte boundary we can do
+	 * the direct block copy loop.  If it is 8-byte aligned
+	 * we can do the 16-byte loads offset by -8 bytes and the
+	 * init stores offset by one register.
+	 *
+	 * If the source is not even 8-byte aligned, we need to do
+	 * shifting and masking (basically integer faligndata).
+	 *
+	 * The careful bit with init stores is that if we store
+	 * to any part of the cache line we have to store the whole
+	 * cacheline else we can end up with corrupt L2 cache line
+	 * contents.  Since the loop works on 64-bytes of 64-byte
+	 * aligned store data at a time, this is easy to ensure.
+	 */
+2:
+	andcc		%o1, (16 - 1), %o4
+	andn		%o2, (64 - 1), %g1	! block copy loop iterator
+	sub		%o2, %g1, %o2		! final sub-block copy bytes
+	be,pt		%XCC, 50f
+	 cmp		%o4, 8
+	be,a,pt		%XCC, 10f
+	 sub		%o1, 0x8, %o1
+
+	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
+	mov		%g1, %o4
+	and		%o1, 0x7, %g1
+	sll		%g1, 3, %g1
+	mov		64, %o3
+	andn		%o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1, %g2))
+	sub		%o3, %g1, %o3
+	sllx		%g2, %g1, %g2
+
+#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
+	EX_LD(LOAD(ldx, SRC, TMP1)); \
+	srlx		TMP1, PRE_SHIFT, TMP2; \
+	or		TMP2, PRE_VAL, TMP2; \
+	EX_ST(STORE_INIT(TMP2, DST)); \
+	sllx		TMP1, POST_SHIFT, PRE_VAL;
+
+1:	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
+	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
+	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
+	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
+	add		%o1, 32, %o1
+	LOAD(prefetch, %o1, #one_read)
+	sub		%o1, 32 - 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
+	add		%o1, 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
+	add		%o1, 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
+	add		%o1, 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
+	subcc		%o4, 64, %o4
+	bne,pt		%XCC, 1b
+	 add		%o0, 64, %o0
+
+#undef SWIVEL_ONE_DWORD
+
+	srl		%g1, 3, %g1
+	ba,pt		%XCC, 60f
+	 add		%o1, %g1, %o1
+
+10:	/* Destination is 64-byte aligned, source was only 8-byte
+	 * aligned but it has been subtracted by 8 and we perform
+	 * one twin load ahead, then add 8 back into source when
+	 * we finish the loop.
+	 */
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+1:	add		%o1, 16, %o1
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add		%o1, 16 + 32, %o1
+	LOAD(prefetch, %o1, #one_read)
+	sub		%o1, 32, %o1
+	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
+	EX_ST(STORE_INIT(%g2, %o0 + 0x08))
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	add		%o1, 16, %o1
+	EX_ST(STORE_INIT(%g3, %o0 + 0x10))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add		%o1, 16, %o1
+	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x28))
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x30))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
+	subcc		%g1, 64, %g1
+	bne,pt		%XCC, 1b
+	 add		%o0, 64, %o0
+
+	ba,pt		%XCC, 60f
+	 add		%o1, 0x8, %o1
+
+50:	/* Destination is 64-byte aligned, and source is 16-byte
+	 * aligned.
+	 */
+1:	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	add	%o1, 16, %o1
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add	%o1, 16 + 32, %o1
+	LOAD(prefetch, %o1, #one_read)
+	sub	%o1, 32, %o1
+	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
+	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	add	%o1, 16, %o1
+	EX_ST(STORE_INIT(%g2, %o0 + 0x10))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x18))
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add	%o1, 16, %o1
+	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
+	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x30))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x38))
+	subcc	%g1, 64, %g1
+	bne,pt	%XCC, 1b
+	 add	%o0, 64, %o0
+	/* fall through */
+
+60:	
+	/* %o2 contains any final bytes still needed to be copied
+	 * over. If anything is left, we copy it one byte at a time.
+	 */
+	RESTORE_ASI(%o3)
+	brz,pt		%o2, 85f
+	 sub		%o0, %o1, %o3
+	ba,a,pt		%XCC, 90f
+
+	.align		64
+70: /* 16 < len <= 64 */
+	bne,pn		%XCC, 75f
+	 sub		%o0, %o1, %o3
+
+72:
+	andn		%o2, 0xf, %o4
+	and		%o2, 0xf, %o2
+1:	subcc		%o4, 0x10, %o4
+	EX_LD(LOAD(ldx, %o1, %o5))
+	add		%o1, 0x08, %o1
+	EX_LD(LOAD(ldx, %o1, %g1))
+	sub		%o1, 0x08, %o1
+	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	add		%o1, 0x8, %o1
+	EX_ST(STORE(stx, %g1, %o1 + %o3))
+	bgu,pt		%XCC, 1b
+	 add		%o1, 0x8, %o1
+73:	andcc		%o2, 0x8, %g0
+	be,pt		%XCC, 1f
+	 nop
+	sub		%o2, 0x8, %o2
+	EX_LD(LOAD(ldx, %o1, %o5))
+	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	add		%o1, 0x8, %o1
+1:	andcc		%o2, 0x4, %g0
+	be,pt		%XCC, 1f
+	 nop
+	sub		%o2, 0x4, %o2
+	EX_LD(LOAD(lduw, %o1, %o5))
+	EX_ST(STORE(stw, %o5, %o1 + %o3))
+	add		%o1, 0x4, %o1
+1:	cmp		%o2, 0
+	be,pt		%XCC, 85f
+	 nop
+	ba,pt		%xcc, 90f
+	 nop
+
+75:
+	andcc		%o0, 0x7, %g1
+	sub		%g1, 0x8, %g1
+	be,pn		%icc, 2f
+	 sub		%g0, %g1, %g1
+	sub		%o2, %g1, %o2
+
+1:	subcc		%g1, 1, %g1
+	EX_LD(LOAD(ldub, %o1, %o5))
+	EX_ST(STORE(stb, %o5, %o1 + %o3))
+	bgu,pt		%icc, 1b
+	 add		%o1, 1, %o1
+
+2:	add		%o1, %o3, %o0
+	andcc		%o1, 0x7, %g1
+	bne,pt		%icc, 8f
+	 sll		%g1, 3, %g1
+
+	cmp		%o2, 16
+	bgeu,pt		%icc, 72b
+	 nop
+	ba,a,pt		%xcc, 73b
+
+8:	mov		64, %o3
+	andn		%o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1, %g2))
+	sub		%o3, %g1, %o3
+	andn		%o2, 0x7, %o4
+	sllx		%g2, %g1, %g2
+1:	add		%o1, 0x8, %o1
+	EX_LD(LOAD(ldx, %o1, %g3))
+	subcc		%o4, 0x8, %o4
+	srlx		%g3, %o3, %o5
+	or		%o5, %g2, %o5
+	EX_ST(STORE(stx, %o5, %o0))
+	add		%o0, 0x8, %o0
+	bgu,pt		%icc, 1b
+	 sllx		%g3, %g1, %g2
+
+	srl		%g1, 3, %g1
+	andcc		%o2, 0x7, %o2
+	be,pn		%icc, 85f
+	 add		%o1, %g1, %o1
+	ba,pt		%xcc, 90f
+	 sub		%o0, %o1, %o3
+
+	.align		64
+80: /* 0 < len <= 16 */
+	andcc		%o3, 0x3, %g0
+	bne,pn		%XCC, 90f
+	 sub		%o0, %o1, %o3
+
+1:
+	subcc		%o2, 4, %o2
+	EX_LD(LOAD(lduw, %o1, %g1))
+	EX_ST(STORE(stw, %g1, %o1 + %o3))
+	bgu,pt		%XCC, 1b
+	 add		%o1, 4, %o1
+
+85:	retl
+	 mov		EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.align		32
+90:
+	subcc		%o2, 1, %o2
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o1 + %o3))
+	bgu,pt		%XCC, 90b
+	 add		%o1, 1, %o1
+	retl
+	 mov		EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.size		FUNC_NAME, .-FUNC_NAME

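(Editor's note: for sources that are not even 8-byte aligned, NGmemcpy falls into the SWIVEL_ONE_DWORD sequence, an integer stand-in for faligndata: keep the useful tail of the previous aligned doubleword and merge in the head of the next. A hedged C model — illustrative only, assuming a big-endian machine and a misaligned source, as the asm path guarantees:)

	#include <stdint.h>
	#include <stddef.h>

	void shift_merge(uint64_t *dst, const uint8_t *src, size_t dwords)
	{
		unsigned int shl = ((uintptr_t)src & 7) * 8;	/* nonzero here */
		unsigned int shr = 64 - shl;
		const uint64_t *s = (const uint64_t *)((uintptr_t)src & ~7UL);
		uint64_t hold = *s++ << shl;	/* discard leading junk bytes */
		size_t i;

		for (i = 0; i < dwords; i++) {
			uint64_t next = *s++;

			dst[i] = hold | (next >> shr);	/* merged doubleword */
			hold = next << shl;
		}
	}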
+ 96 - 0
arch/sparc64/lib/NGpage.S

@@ -0,0 +1,96 @@
+/* NGpage.S: Niagara optimized clear and copy page.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#include <asm/asi.h>
+#include <asm/page.h>
+
+	.text
+	.align	32
+
+	/* This is heavily simplified from the sun4u variants
+	 * because Niagara does not have any D-cache aliasing issues
+	 * and also we don't need to use the FPU in order to implement
+	 * an optimal page copy/clear.
+	 */
+
+NGcopy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
+	prefetch	[%o1 + 0x00], #one_read
+	mov		8, %g1
+	mov		16, %g2
+	mov		24, %g3
+	set		PAGE_SIZE, %g7
+
+1:	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
+	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
+	prefetch	[%o1 + 0x40], #one_read
+	add		%o1, 32, %o1
+	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
+	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
+	add		%o1, 32, %o1
+	add		%o0, 32, %o0
+	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	subcc		%g7, 64, %g7
+	bne,pt		%xcc, 1b
+	 add		%o0, 32, %o0
+	retl
+	 nop
+
+NGclear_page:		/* %o0=dest */
+NGclear_user_page:	/* %o0=dest, %o1=vaddr */
+	mov		8, %g1
+	mov		16, %g2
+	mov		24, %g3
+	set		PAGE_SIZE, %g7
+
+1:	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 32, %o0
+	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	subcc		%g7, 64, %g7
+	bne,pt		%xcc, 1b
+	 add		%o0, 32, %o0
+	retl
+	 nop
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara_patch_pageops
+	.type	niagara_patch_pageops,#function
+niagara_patch_pageops:
+	NG_DO_PATCH(copy_user_page, NGcopy_user_page)
+	NG_DO_PATCH(_clear_page, NGclear_page)
+	NG_DO_PATCH(clear_user_page, NGclear_user_page)
+	retl
+	 nop
+	.size	niagara_patch_pageops,.-niagara_patch_pageops

+ 33 - 0
arch/sparc64/lib/NGpatch.S

@@ -0,0 +1,33 @@
+/* NGpatch.S: Patch Ultra-I routines with Niagara variant.
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara_patch_copyops
+	.type	niagara_patch_copyops,#function
+niagara_patch_copyops:
+	NG_DO_PATCH(memcpy, NGmemcpy)
+	NG_DO_PATCH(___copy_from_user, NGcopy_from_user)
+	NG_DO_PATCH(___copy_to_user, NGcopy_to_user)
+	retl
+	 nop
+	.size	niagara_patch_copyops,.-niagara_patch_copyops

+ 2 - 1
arch/sparc64/lib/U3patch.S

@@ -12,7 +12,8 @@
 	or	%g2, %lo(OLD), %g2; \
 	sub	%g1, %g2, %g1; \
 	sethi	%hi(BRANCH_ALWAYS), %g3; \
-	srl	%g1, 2, %g1; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
 	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
 	or	%g3, %g1, %g3; \
 	stw	%g3, [%g2]; \

+ 9 - 9
arch/sparc64/lib/bzero.S

@@ -98,12 +98,12 @@ __bzero_done:
 	.text;			\
 	.align 4;
 
-	.globl	__bzero_noasi
-	.type	__bzero_noasi, #function
-__bzero_noasi:		/* %o0=buf, %o1=len */
-	brz,pn		%o1, __bzero_noasi_done
+	.globl	__clear_user
+	.type	__clear_user, #function
+__clear_user:		/* %o0=buf, %o1=len */
+	brz,pn		%o1, __clear_user_done
 	 cmp		%o1, 16
-	bl,pn		%icc, __bzero_noasi_tiny
+	bl,pn		%icc, __clear_user_tiny
 	 EX_ST(prefetcha [%o0 + 0x00] %asi, #n_writes)
 	andcc		%o0, 0x3, %g0
 	be,pt		%icc, 2f
@@ -145,14 +145,14 @@ __bzero_noasi:		/* %o0=buf, %o1=len */
 	subcc		%g1, 8, %g1
 	bne,pt		%icc, 5b
 	 add		%o0, 0x8, %o0
-6:	brz,pt		%o1, __bzero_noasi_done
+6:	brz,pt		%o1, __clear_user_done
 	 nop
-__bzero_noasi_tiny:
+__clear_user_tiny:
 1:	EX_ST(stba	%g0, [%o0 + 0x00] %asi)
 	subcc		%o1, 1, %o1
 	bne,pt		%icc, 1b
 	 add		%o0, 1, %o0
-__bzero_noasi_done:
+__clear_user_done:
 	retl
 	 clr		%o0
-	.size		__bzero_noasi, .-__bzero_noasi
+	.size		__clear_user, .-__clear_user

+ 5 - 7
arch/sparc64/lib/clear_page.S

@@ -9,6 +9,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/spitfire.h>
+#include <asm/head.h>
 
 	/* What we used to do was lock a TLB entry into a specific
 	 * TLB slot, clear the page with interrupts disabled, then
@@ -22,9 +23,6 @@
 	 * disable preemption during the clear.
 	 */
 
-#define TTE_BITS_TOP	(_PAGE_VALID | _PAGE_SZBITS)
-#define TTE_BITS_BOTTOM	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_L | _PAGE_W)
-
 	.text
 
 	.globl		_clear_page
@@ -43,12 +41,11 @@ clear_user_page:	/* %o0=dest, %o1=vaddr */
 	sethi		%hi(PAGE_SIZE), %o4
 
 	sllx		%g2, 32, %g2
-	sethi		%uhi(TTE_BITS_TOP), %g3
+	sethi		%hi(PAGE_KERNEL_LOCKED), %g3
 
-	sllx		%g3, 32, %g3
+	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
 	sub		%o0, %g2, %g1		! paddr
 
-	or		%g3, TTE_BITS_BOTTOM, %g3
 	and		%o1, %o4, %o0		! vaddr D-cache alias bit
 
 	or		%g1, %g3, %g1		! TTE data
@@ -66,7 +63,8 @@ clear_user_page:	/* %o0=dest, %o1=vaddr */
 	wrpr		%o4, PSTATE_IE, %pstate
 	stxa		%o0, [%g3] ASI_DMMU
 	stxa		%g1, [%g0] ASI_DTLB_DATA_IN
-	flush		%g6
+	sethi		%hi(KERNBASE), %g1
+	flush		%g1
 	wrpr		%o4, 0x0, %pstate
 
 	mov		1, %o4

+ 2 - 5
arch/sparc64/lib/copy_page.S

@@ -23,8 +23,6 @@
 	 * disable preemption during the clear.
 	 */
 
-#define TTE_BITS_TOP	(_PAGE_VALID | _PAGE_SZBITS)
-#define TTE_BITS_BOTTOM	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_L | _PAGE_W)
 #define	DCACHE_SIZE	(PAGE_SIZE * 2)
 
 #if (PAGE_SHIFT == 13) || (PAGE_SHIFT == 19)
@@ -52,13 +50,12 @@ copy_user_page:		/* %o0=dest, %o1=src, %o2=vaddr */
 	sethi		%hi(PAGE_SIZE), %o3
 
 	sllx		%g2, 32, %g2
-	sethi		%uhi(TTE_BITS_TOP), %g3
+	sethi		%hi(PAGE_KERNEL_LOCKED), %g3
 
-	sllx		%g3, 32, %g3
+	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
 	sub		%o0, %g2, %g1		! dest paddr
 
 	sub		%o1, %g2, %g2		! src paddr
-	or		%g3, TTE_BITS_BOTTOM, %g3
 
 	and		%o2, %o3, %o0		! vaddr D-cache alias bit
 	or		%g1, %g3, %g1		! dest TTE data

+ 8 - 11
arch/sparc64/lib/delay.c

@@ -1,6 +1,6 @@
 /* delay.c: Delay loops for sparc64
  *
- * Copyright (C) 2004 David S. Miller <davem@redhat.com>
+ * Copyright (C) 2004, 2006 David S. Miller <davem@davemloft.net>
  *
  * Based heavily upon x86 variant which is:
  *	Copyright (C) 1993 Linus Torvalds
@@ -8,19 +8,16 @@
  */
 
 #include <linux/delay.h>
+#include <asm/timer.h>
 
 void __delay(unsigned long loops)
 {
-	__asm__ __volatile__(
-"	b,pt	%%xcc, 1f\n"
-"	 cmp	%0, 0\n"
-"	.align	32\n"
-"1:\n"
-"	bne,pt	%%xcc, 1b\n"
-"	 subcc	%0, 1, %0\n"
-	: "=&r" (loops)
-	: "0" (loops)
-	: "cc");
+	unsigned long bclock, now;
+
+	bclock = tick_ops->get_tick();
+	do {
+		now = tick_ops->get_tick();
+	} while ((now-bclock) < loops);
 }
 
 /* We used to multiply by HZ after shifting down by 32 bits

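(Editor's note: with __delay() now measured in %tick cycles rather than loop iterations, callers need only a frequency conversion. A minimal sketch under stated assumptions — ticks_per_usec is an invented precomputed value, not an interface from this commit:)

	extern void __delay(unsigned long loops);
	extern unsigned long ticks_per_usec;	/* cpu_hz / 1000000, assumed */

	static inline void usec_delay(unsigned long usecs)
	{
		/* one "loop" is now one tick of the cycle counter */
		__delay(usecs * ticks_per_usec);
	}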
+ 299 - 1
arch/sparc64/lib/xor.S

@@ -2,9 +2,10 @@
  * arch/sparc64/lib/xor.S
  *
  * High speed xor_block operation for RAID4/5 utilizing the
- * UltraSparc Visual Instruction Set.
+ * UltraSparc Visual Instruction Set and Niagara store-init/twin-load.
  *
  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
  */
 
 #include <asm/visasm.h>
@@ -19,6 +20,8 @@
  */
 	.text
 	.align	32
+
+	/* VIS versions. */
 	.globl	xor_vis_2
 	.type	xor_vis_2,#function
 xor_vis_2:
@@ -352,3 +355,298 @@ xor_vis_5:
 	ret
 	 restore
 	.size	xor_vis_5, .-xor_vis_5
+
+	/* Niagara versions. */
+	.globl		xor_niagara_2
+	.type		xor_niagara_2,#function
+xor_niagara_2:		/* %o0=bytes, %o1=dest, %o2=src */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src  + 0x00 */
+	ldda		[%i1 + 0x10] %asi, %i4	/* %i4/%i5 = src  + 0x10 */
+	ldda		[%i1 + 0x20] %asi, %g2	/* %g2/%g3 = src  + 0x20 */
+	ldda		[%i1 + 0x30] %asi, %l0	/* %l0/%l1 = src  + 0x30 */
+	prefetch	[%i1 + 0x40], #one_read
+	ldda		[%i0 + 0x00] %asi, %o0  /* %o0/%o1 = dest + 0x00 */
+	ldda		[%i0 + 0x10] %asi, %o2  /* %o2/%o3 = dest + 0x10 */
+	ldda		[%i0 + 0x20] %asi, %o4  /* %o4/%o5 = dest + 0x20 */
+	ldda		[%i0 + 0x30] %asi, %l2  /* %l2/%l3 = dest + 0x30 */
+	prefetch	[%i0 + 0x40], #n_writes
+	xor		%o0, %i2, %o0
+	xor		%o1, %i3, %o1
+	stxa		%o0, [%i0 + 0x00] %asi
+	stxa		%o1, [%i0 + 0x08] %asi
+	xor		%o2, %i4, %o2
+	xor		%o3, %i5, %o3
+	stxa		%o2, [%i0 + 0x10] %asi
+	stxa		%o3, [%i0 + 0x18] %asi
+	xor		%o4, %g2, %o4
+	xor		%o5, %g3, %o5
+	stxa		%o4, [%i0 + 0x20] %asi
+	stxa		%o5, [%i0 + 0x28] %asi
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x30] %asi
+	stxa		%l3, [%i0 + 0x38] %asi
+	add		%i0, 0x40, %i0
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%i1, 0x40, %i1
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_2, .-xor_niagara_2
+
+	.globl		xor_niagara_3
+	.type		xor_niagara_3,#function
+xor_niagara_3:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	prefetch	[%i3], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+	mov		%i3, %l7
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src1 + 0x00 */
+	ldda		[%i1 + 0x10] %asi, %i4	/* %i4/%i5 = src1 + 0x10 */
+	ldda		[%l7 + 0x00] %asi, %g2	/* %g2/%g3 = src2 + 0x00 */
+	ldda		[%l7 + 0x10] %asi, %l0	/* %l0/%l1 = src2 + 0x10 */
+	ldda		[%i0 + 0x00] %asi, %o0  /* %o0/%o1 = dest + 0x00 */
+	ldda		[%i0 + 0x10] %asi, %o2  /* %o2/%o3 = dest + 0x10 */
+	xor		%g2, %i2, %g2
+	xor		%g3, %i3, %g3
+	xor		%o0, %g2, %o0
+	xor		%o1, %g3, %o1
+	stxa		%o0, [%i0 + 0x00] %asi
+	stxa		%o1, [%i0 + 0x08] %asi
+	ldda		[%i1 + 0x20] %asi, %i2	/* %i2/%i3 = src1 + 0x20 */
+	ldda		[%l7 + 0x20] %asi, %g2	/* %g2/%g3 = src2 + 0x20 */
+	ldda		[%i0 + 0x20] %asi, %o0	/* %o0/%o1 = dest + 0x20 */
+	xor		%l0, %i4, %l0
+	xor		%l1, %i5, %l1
+	xor		%o2, %l0, %o2
+	xor		%o3, %l1, %o3
+	stxa		%o2, [%i0 + 0x10] %asi
+	stxa		%o3, [%i0 + 0x18] %asi
+	ldda		[%i1 + 0x30] %asi, %i4	/* %i4/%i5 = src1 + 0x30 */
+	ldda		[%l7 + 0x30] %asi, %l0	/* %l0/%l1 = src2 + 0x30 */
+	ldda		[%i0 + 0x30] %asi, %o2	/* %o2/%o3 = dest + 0x30 */
+	prefetch	[%i1 + 0x40], #one_read
+	prefetch	[%l7 + 0x40], #one_read
+	prefetch	[%i0 + 0x40], #n_writes
+	xor		%g2, %i2, %g2
+	xor		%g3, %i3, %g3
+	xor		%o0, %g2, %o0
+	xor		%o1, %g3, %o1
+	stxa		%o0, [%i0 + 0x20] %asi
+	stxa		%o1, [%i0 + 0x28] %asi
+	xor		%l0, %i4, %l0
+	xor		%l1, %i5, %l1
+	xor		%o2, %l0, %o2
+	xor		%o3, %l1, %o3
+	stxa		%o2, [%i0 + 0x30] %asi
+	stxa		%o3, [%i0 + 0x38] %asi
+	add		%i0, 0x40, %i0
+	add		%i1, 0x40, %i1
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%l7, 0x40, %l7
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_3, .-xor_niagara_3
+
+	.globl		xor_niagara_4
+	.type		xor_niagara_4,#function
+xor_niagara_4:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	prefetch	[%i3], #one_read
+	prefetch	[%i4], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+	mov		%i3, %l7
+	mov		%i4, %l6
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src1 + 0x00 */
+	ldda		[%l7 + 0x00] %asi, %i4	/* %i4/%i5 = src2 + 0x00 */
+	ldda		[%l6 + 0x00] %asi, %g2	/* %g2/%g3 = src3 + 0x00 */
+	ldda		[%i0 + 0x00] %asi, %l0	/* %l0/%l1 = dest + 0x00 */
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x10] %asi, %i2	/* %i2/%i3 = src1 + 0x10 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x10] %asi, %i4	/* %i4/%i5 = src2 + 0x10 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x00] %asi
+	stxa		%l1, [%i0 + 0x08] %asi
+	ldda		[%l6 + 0x10] %asi, %g2	/* %g2/%g3 = src3 + 0x10 */
+	ldda		[%i0 + 0x10] %asi, %l0	/* %l0/%l1 = dest + 0x10 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x20] %asi, %i2	/* %i2/%i3 = src1 + 0x20 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x20] %asi, %i4	/* %i4/%i5 = src2 + 0x20 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x10] %asi
+	stxa		%l1, [%i0 + 0x18] %asi
+	ldda		[%l6 + 0x20] %asi, %g2	/* %g2/%g3 = src3 + 0x20 */
+	ldda		[%i0 + 0x20] %asi, %l0	/* %l0/%l1 = dest + 0x20 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x30] %asi, %i2	/* %i2/%i3 = src1 + 0x30 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x30] %asi, %i4	/* %i4/%i5 = src2 + 0x30 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x20] %asi
+	stxa		%l1, [%i0 + 0x28] %asi
+	ldda		[%l6 + 0x30] %asi, %g2	/* %g2/%g3 = src3 + 0x30 */
+	ldda		[%i0 + 0x30] %asi, %l0	/* %l0/%l1 = dest + 0x30 */
+
+	prefetch	[%i1 + 0x40], #one_read
+	prefetch	[%l7 + 0x40], #one_read
+	prefetch	[%l6 + 0x40], #one_read
+	prefetch	[%i0 + 0x40], #n_writes
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x30] %asi
+	stxa		%l1, [%i0 + 0x38] %asi
+
+	add		%i0, 0x40, %i0
+	add		%i1, 0x40, %i1
+	add		%l7, 0x40, %l7
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%l6, 0x40, %l6
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_4, .-xor_niagara_4
+
+	.globl		xor_niagara_5
+	.type		xor_niagara_5,#function
+xor_niagara_5:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	prefetch	[%i3], #one_read
+	prefetch	[%i4], #one_read
+	prefetch	[%i5], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+	mov		%i3, %l7
+	mov		%i4, %l6
+	mov		%i5, %l5
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src1 + 0x00 */
+	ldda		[%l7 + 0x00] %asi, %i4	/* %i4/%i5 = src2 + 0x00 */
+	ldda		[%l6 + 0x00] %asi, %g2	/* %g2/%g3 = src3 + 0x00 */
+	ldda		[%l5 + 0x00] %asi, %l0	/* %l0/%l1 = src4 + 0x00 */
+	ldda		[%i0 + 0x00] %asi, %l2	/* %l2/%l3 = dest + 0x00 */
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x10] %asi, %i2	/* %i2/%i3 = src1 + 0x10 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x10] %asi, %i4	/* %i4/%i5 = src2 + 0x10 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	ldda		[%l6 + 0x10] %asi, %g2	/* %g2/%g3 = src3 + 0x10 */
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x00] %asi
+	stxa		%l3, [%i0 + 0x08] %asi
+	ldda		[%l5 + 0x10] %asi, %l0	/* %l0/%l1 = src4 + 0x10 */
+	ldda		[%i0 + 0x10] %asi, %l2	/* %l2/%l3 = dest + 0x10 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x20] %asi, %i2	/* %i2/%i3 = src1 + 0x20 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x20] %asi, %i4	/* %i4/%i5 = src2 + 0x20 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	ldda		[%l6 + 0x20] %asi, %g2	/* %g2/%g3 = src3 + 0x20 */
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x10] %asi
+	stxa		%l3, [%i0 + 0x18] %asi
+	ldda		[%l5 + 0x20] %asi, %l0	/* %l0/%l1 = src4 + 0x20 */
+	ldda		[%i0 + 0x20] %asi, %l2	/* %l2/%l3 = dest + 0x20 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x30] %asi, %i2	/* %i2/%i3 = src1 + 0x30 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x30] %asi, %i4	/* %i4/%i5 = src2 + 0x30 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	ldda		[%l6 + 0x30] %asi, %g2	/* %g2/%g3 = src3 + 0x30 */
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x20] %asi
+	stxa		%l3, [%i0 + 0x28] %asi
+	ldda		[%l5 + 0x30] %asi, %l0	/* %l0/%l1 = src4 + 0x30 */
+	ldda		[%i0 + 0x30] %asi, %l2	/* %l2/%l3 = dest + 0x30 */
+
+	prefetch	[%i1 + 0x40], #one_read
+	prefetch	[%l7 + 0x40], #one_read
+	prefetch	[%l6 + 0x40], #one_read
+	prefetch	[%l5 + 0x40], #one_read
+	prefetch	[%i0 + 0x40], #n_writes
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x30] %asi
+	stxa		%l3, [%i0 + 0x38] %asi
+
+	add		%i0, 0x40, %i0
+	add		%i1, 0x40, %i1
+	add		%l7, 0x40, %l7
+	add		%l6, 0x40, %l6
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%l5, 0x40, %l5
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_5, .-xor_niagara_5

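(Editor's note: the Niagara xor routines get their speed from block-init stores and quad loads, but the data transformation itself is plain xor over 64-byte blocks. Equivalent C for the two-operand case — an illustrative sketch assuming, as the callers guarantee, that bytes is a multiple of 64:)

	#include <stdint.h>
	#include <stddef.h>

	static void xor_2_model(size_t bytes, uint64_t *dest, const uint64_t *src)
	{
		size_t i;

		/* xor_niagara_2 does this 64 bytes per iteration with
		 * ASI_BLK_INIT_QUAD_LDD_P loads and stores. */
		for (i = 0; i < bytes / sizeof(uint64_t); i++)
			dest[i] ^= src[i];
	}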
+ 22 - 2
arch/sparc64/math-emu/math.c

@@ -206,9 +206,29 @@ int do_mathemu(struct pt_regs *regs, struct fpustate *f)
 			case FSTOQ: TYPE(3,3,1,1,1,0,0); break;
 			case FDTOQ: TYPE(3,3,1,2,1,0,0); break;
 			case FQTOI: TYPE(3,1,0,3,1,0,0); break;
+
+			/* We can get either unimplemented or unfinished
+			 * for these cases.  Pre-Niagara systems generate
+			 * unfinished fpop for SUBNORMAL cases, and Niagara
+			 * always gives unimplemented fpop for fsqrt{s,d}.
+			 */
+			case FSQRTS: {
+				unsigned long x = current_thread_info()->xfsr[0];
+
+				x = (x >> 14) & 0xf;
+				TYPE(x,1,1,1,1,0,0);
+				break;
+			}
+
+			case FSQRTD: {
+				unsigned long x = current_thread_info()->xfsr[0];
+
+				x = (x >> 14) & 0xf;
+				TYPE(x,2,1,2,1,0,0);
+				break;
+			}
+
 			/* SUBNORMAL - ftt == 2 */
-			case FSQRTS: TYPE(2,1,1,1,1,0,0); break;
-			case FSQRTD: TYPE(2,2,1,2,1,0,0); break;
 			case FADDD:
 			case FSUBD:
 			case FMULD:

+ 1 - 1
arch/sparc64/mm/Makefile

@@ -5,6 +5,6 @@
 EXTRA_AFLAGS := -ansi
 EXTRA_CFLAGS := -Werror
 
-obj-y    := ultra.o tlb.o fault.o init.o generic.o
+obj-y    := ultra.o tlb.o tsb.o fault.o init.o generic.o
 
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o

+ 11 - 4
arch/sparc64/mm/fault.c

@@ -29,6 +29,7 @@
 #include <asm/lsu.h>
 #include <asm/sections.h>
 #include <asm/kdebug.h>
+#include <asm/mmu_context.h>
 
 /*
  * To debug kernel to catch accesses to certain virtual/physical addresses.
@@ -91,12 +92,13 @@ static void __kprobes unhandled_fault(unsigned long address,
 	die_if_kernel("Oops", regs);
 }
 
-static void bad_kernel_pc(struct pt_regs *regs)
+static void bad_kernel_pc(struct pt_regs *regs, unsigned long vaddr)
 {
 	unsigned long *ksp;
 
 	printk(KERN_CRIT "OOPS: Bogus kernel PC [%016lx] in fault handler\n",
 	       regs->tpc);
+	printk(KERN_CRIT "OOPS: Fault was to vaddr[%lx]\n", vaddr);
 	__asm__("mov %%sp, %0" : "=r" (ksp));
 	show_stack(current, ksp);
 	unhandled_fault(regs->tpc, current, regs);
@@ -137,7 +139,7 @@ static unsigned int get_user_insn(unsigned long tpc)
 	if (!pte_present(pte))
 		goto out;
 
-	pa  = (pte_val(pte) & _PAGE_PADDR);
+	pa  = (pte_pfn(pte) << PAGE_SHIFT);
 	pa += (tpc & ~PAGE_MASK);
 
 	/* Use phys bypass so we don't pollute dtlb/dcache. */
@@ -257,7 +259,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	struct vm_area_struct *vma;
 	unsigned int insn = 0;
 	int si_code, fault_code;
-	unsigned long address;
+	unsigned long address, mm_rss;
 
 	fault_code = get_thread_fault_code();
 
@@ -280,7 +282,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 		    (tpc >= MODULES_VADDR && tpc < MODULES_END)) {
 			/* Valid, no problems... */
 		} else {
-			bad_kernel_pc(regs);
+			bad_kernel_pc(regs, address);
 			return;
 		}
 	}
@@ -406,6 +408,11 @@ good_area:
 	}
 
 	up_read(&mm->mmap_sem);
+
+	mm_rss = get_mm_rss(mm);
+	if (unlikely(mm_rss >= mm->context.tsb_rss_limit))
+		tsb_grow(mm, mm_rss);
+
 	return;
 
 	/*

+ 18 - 22
arch/sparc64/mm/generic.c

@@ -15,15 +15,6 @@
 #include <asm/page.h>
 #include <asm/tlbflush.h>
 
-static inline pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space)
-{
-	pte_t pte;
-	pte_val(pte) = (((page) | pgprot_val(prot) | _PAGE_E) &
-			~(unsigned long)_PAGE_CACHE);
-	pte_val(pte) |= (((unsigned long)space) << 32);
-	return pte;
-}
-
 /* Remap IO memory, the same way as remap_pfn_range(), but use
  * the obio memory space.
  *
@@ -48,24 +39,29 @@ static inline void io_remap_pte_range(struct mm_struct *mm, pte_t * pte,
 		pte_t entry;
 		unsigned long curend = address + PAGE_SIZE;
 
-		entry = mk_pte_io(offset, prot, space);
+		entry = mk_pte_io(offset, prot, space, PAGE_SIZE);
 		if (!(address & 0xffff)) {
-			if (!(address & 0x3fffff) && !(offset & 0x3ffffe) && end >= address + 0x400000) {
-				entry = mk_pte_io(offset,
-						  __pgprot(pgprot_val (prot) | _PAGE_SZ4MB),
-						  space);
+			if (PAGE_SIZE < (4 * 1024 * 1024) &&
+			    !(address & 0x3fffff) &&
+			    !(offset & 0x3ffffe) &&
+			    end >= address + 0x400000) {
+				entry = mk_pte_io(offset, prot, space,
+						  4 * 1024 * 1024);
 				curend = address + 0x400000;
 				offset += 0x400000;
-			} else if (!(address & 0x7ffff) && !(offset & 0x7fffe) && end >= address + 0x80000) {
-				entry = mk_pte_io(offset,
-						  __pgprot(pgprot_val (prot) | _PAGE_SZ512K),
-						  space);
+			} else if (PAGE_SIZE < (512 * 1024) &&
+				   !(address & 0x7ffff) &&
+				   !(offset & 0x7fffe) &&
+				   end >= address + 0x80000) {
+				entry = mk_pte_io(offset, prot, space,
+						  512 * 1024);
 				curend = address + 0x80000;
 				offset += 0x80000;
-			} else if (!(offset & 0xfffe) && end >= address + 0x10000) {
-				entry = mk_pte_io(offset,
-						  __pgprot(pgprot_val (prot) | _PAGE_SZ64K),
-						  space);
+			} else if (PAGE_SIZE < (64 * 1024) &&
+				   !(offset & 0xfffe) &&
+				   end >= address + 0x10000) {
+				entry = mk_pte_io(offset, prot, space,
+						  64 * 1024);
 				curend = address + 0x10000;
 				offset += 0x10000;
 			} else

+ 175 - 4
arch/sparc64/mm/hugetlbpage.c

@@ -1,7 +1,7 @@
 /*
  * SPARC64 Huge TLB page support.
  *
- * Copyright (C) 2002, 2003 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2002, 2003, 2006 David S. Miller (davem@davemloft.net)
  */
 
 #include <linux/config.h>
@@ -22,6 +22,175 @@
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 
+/* Slightly simplified from the non-hugepage variant because by
+ * definition we don't have to worry about any page coloring stuff
+ */
+#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL))
+#define VA_EXCLUDE_END   (0xfffff80000000000UL + (1UL << 32UL))
+
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
+							unsigned long addr,
+							unsigned long len,
+							unsigned long pgoff,
+							unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct * vma;
+	unsigned long task_size = TASK_SIZE;
+	unsigned long start_addr;
+
+	if (test_thread_flag(TIF_32BIT))
+		task_size = STACK_TOP32;
+	if (unlikely(len >= VA_EXCLUDE_START))
+		return -ENOMEM;
+
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
+
+	task_size -= len;
+
+full_search:
+	addr = ALIGN(addr, HPAGE_SIZE);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (addr < VA_EXCLUDE_START &&
+		    (addr + len) >= VA_EXCLUDE_START) {
+			addr = VA_EXCLUDE_END;
+			vma = find_vma(mm, VA_EXCLUDE_END);
+		}
+		if (unlikely(task_size < addr)) {
+			if (start_addr != TASK_UNMAPPED_BASE) {
+				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
+				goto full_search;
+			}
+			return -ENOMEM;
+		}
+		if (likely(!vma || addr + len <= vma->vm_start)) {
+			/*
+			 * Remember the place where we stopped the search:
+			 */
+			mm->free_area_cache = addr + len;
+			return addr;
+		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
+		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+	}
+}
+
+static unsigned long
+hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+				  const unsigned long len,
+				  const unsigned long pgoff,
+				  const unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr = addr0;
+
+	/* This should only ever run for 32-bit processes.  */
+	BUG_ON(!test_thread_flag(TIF_32BIT));
+
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+ 	        mm->cached_hole_size = 0;
+ 		mm->free_area_cache = mm->mmap_base;
+ 	}
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = mm->free_area_cache & HPAGE_MASK;
+
+	/* make sure it can fit in the remaining address space */
+	if (likely(addr > len)) {
+		vma = find_vma(mm, addr-len);
+		if (!vma || addr <= vma->vm_start) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr-len);
+		}
+	}
+
+	if (unlikely(mm->mmap_base < len))
+		goto bottomup;
+
+	addr = (mm->mmap_base-len) & HPAGE_MASK;
+
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (likely(!vma || addr+len <= vma->vm_start)) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+		}
+
+ 		/* remember the largest hole we saw so far */
+ 		if (addr + mm->cached_hole_size < vma->vm_start)
+ 		        mm->cached_hole_size = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = (vma->vm_start-len) & HPAGE_MASK;
+	} while (likely(len < vma->vm_start));
+
+bottomup:
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->cached_hole_size = ~0UL;
+  	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long task_size = TASK_SIZE;
+
+	if (test_thread_flag(TIF_32BIT))
+		task_size = STACK_TOP32;
+
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (len > task_size)
+		return -ENOMEM;
+
+	if (addr) {
+		addr = ALIGN(addr, HPAGE_SIZE);
+		vma = find_vma(mm, addr);
+		if (task_size - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+	if (mm->get_unmapped_area == arch_get_unmapped_area)
+		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+				pgoff, flags);
+	else
+		return hugetlb_get_unmapped_area_topdown(file, addr, len,
+				pgoff, flags);
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -48,12 +217,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	pmd_t *pmd;
 	pte_t *pte = NULL;
 
+	addr &= HPAGE_MASK;
+
 	pgd = pgd_offset(mm, addr);
-	if (pgd) {
+	if (!pgd_none(*pgd)) {
 		pud = pud_offset(pgd, addr);
-		if (pud) {
+		if (!pud_none(*pud)) {
 			pmd = pmd_offset(pud, addr);
-			if (pmd)
+			if (!pmd_none(*pmd))
 				pte = pte_offset_map(pmd, addr);
 		}
 	}

File diff suppressed because it is too large
+ 245 - 603
arch/sparc64/mm/init.c


+ 4 - 60
arch/sparc64/mm/tlb.c

@@ -25,6 +25,8 @@ void flush_tlb_pending(void)
 	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
 
 	if (mp->tlb_nr) {
+		flush_tsb_user(mp);
+
 		if (CTX_VALID(mp->mm->context)) {
 #ifdef CONFIG_SMP
 			smp_flush_tlb_pending(mp->mm, mp->tlb_nr,
@@ -47,7 +49,8 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
 	if (pte_exec(orig))
 		vaddr |= 0x1UL;
 
-	if (pte_dirty(orig)) {
+	if (tlb_type != hypervisor &&
+	    pte_dirty(orig)) {
 		unsigned long paddr, pfn = pte_pfn(orig);
 		struct address_space *mapping;
 		struct page *page;
@@ -89,62 +92,3 @@ no_cache_flush:
 	if (nr >= TLB_BATCH_NR)
 		flush_tlb_pending();
 }
-
-void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
-{
-	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
-	unsigned long nr = mp->tlb_nr;
-	long s = start, e = end, vpte_base;
-
-	if (mp->fullmm)
-		return;
-
-	/* If start is greater than end, that is a real problem.  */
-	BUG_ON(start > end);
-
-	/* However, straddling the VA space hole is quite normal. */
-	s &= PMD_MASK;
-	e = (e + PMD_SIZE - 1) & PMD_MASK;
-
-	vpte_base = (tlb_type == spitfire ?
-		     VPTE_BASE_SPITFIRE :
-		     VPTE_BASE_CHEETAH);
-
-	if (unlikely(nr != 0 && mm != mp->mm)) {
-		flush_tlb_pending();
-		nr = 0;
-	}
-
-	if (nr == 0)
-		mp->mm = mm;
-
-	start = vpte_base + (s >> (PAGE_SHIFT - 3));
-	end = vpte_base + (e >> (PAGE_SHIFT - 3));
-
-	/* If the request straddles the VA space hole, we
-	 * need to swap start and end.  The reason this
-	 * occurs is that "vpte_base" is the center of
-	 * the linear page table mapping area.  Thus,
-	 * high addresses with the sign bit set map to
-	 * addresses below vpte_base and non-sign bit
-	 * addresses map to addresses above vpte_base.
-	 */
-	if (end < start) {
-		unsigned long tmp = start;
-
-		start = end;
-		end = tmp;
-	}
-
-	while (start < end) {
-		mp->vaddrs[nr] = start;
-		mp->tlb_nr = ++nr;
-		if (nr >= TLB_BATCH_NR) {
-			flush_tlb_pending();
-			nr = 0;
-		}
-		start += PAGE_SIZE;
-	}
-	if (nr)
-		flush_tlb_pending();
-}

+ 440 - 0
arch/sparc64/mm/tsb.c

@@ -0,0 +1,440 @@
+/* arch/sparc64/mm/tsb.c
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tsb.h>
+#include <asm/oplib.h>
+
+extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+
+static inline unsigned long tsb_hash(unsigned long vaddr, unsigned long nentries)
+{
+	vaddr >>= PAGE_SHIFT;
+	return vaddr & (nentries - 1);
+}
+
+static inline int tag_compare(unsigned long tag, unsigned long vaddr)
+{
+	return (tag == (vaddr >> 22));
+}
+
+/* TSB flushes need only occur on the processor initiating the address
+ * space modification, not on each cpu the address space has run on.
+ * Only the TLB flush needs that treatment.
+ */
+
+void flush_tsb_kernel_range(unsigned long start, unsigned long end)
+{
+	unsigned long v;
+
+	for (v = start; v < end; v += PAGE_SIZE) {
+		unsigned long hash = tsb_hash(v, KERNEL_TSB_NENTRIES);
+		struct tsb *ent = &swapper_tsb[hash];
+
+		if (tag_compare(ent->tag, v)) {
+			ent->tag = (1UL << TSB_TAG_INVALID_BIT);
+			membar_storeload_storestore();
+		}
+	}
+}
+
+void flush_tsb_user(struct mmu_gather *mp)
+{
+	struct mm_struct *mm = mp->mm;
+	unsigned long nentries, base, flags;
+	struct tsb *tsb;
+	int i;
+
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	tsb = mm->context.tsb;
+	nentries = mm->context.tsb_nentries;
+
+	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
+		base = __pa(tsb);
+	else
+		base = (unsigned long) tsb;
+	
+	for (i = 0; i < mp->tlb_nr; i++) {
+		unsigned long v = mp->vaddrs[i];
+		unsigned long tag, ent, hash;
+
+		v &= ~0x1UL;
+
+		hash = tsb_hash(v, nentries);
+		ent = base + (hash * sizeof(struct tsb));
+		tag = (v >> 22UL);
+
+		tsb_flush(ent, tag);
+	}
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
+}
+
+static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes)
+{
+	unsigned long tsb_reg, base, tsb_paddr;
+	unsigned long page_sz, tte;
+
+	mm->context.tsb_nentries = tsb_bytes / sizeof(struct tsb);
+
+	base = TSBMAP_BASE;
+	tte = pgprot_val(PAGE_KERNEL_LOCKED);
+	tsb_paddr = __pa(mm->context.tsb);
+	BUG_ON(tsb_paddr & (tsb_bytes - 1UL));
+
+	/* Use the smallest page size that can map the whole TSB
+	 * in one TLB entry.
+	 */
+	switch (tsb_bytes) {
+	case 8192 << 0:
+		tsb_reg = 0x0UL;
+#ifdef DCACHE_ALIASING_POSSIBLE
+		base += (tsb_paddr & 8192);
+#endif
+		page_sz = 8192;
+		break;
+
+	case 8192 << 1:
+		tsb_reg = 0x1UL;
+		page_sz = 64 * 1024;
+		break;
+
+	case 8192 << 2:
+		tsb_reg = 0x2UL;
+		page_sz = 64 * 1024;
+		break;
+
+	case 8192 << 3:
+		tsb_reg = 0x3UL;
+		page_sz = 64 * 1024;
+		break;
+
+	case 8192 << 4:
+		tsb_reg = 0x4UL;
+		page_sz = 512 * 1024;
+		break;
+
+	case 8192 << 5:
+		tsb_reg = 0x5UL;
+		page_sz = 512 * 1024;
+		break;
+
+	case 8192 << 6:
+		tsb_reg = 0x6UL;
+		page_sz = 512 * 1024;
+		break;
+
+	case 8192 << 7:
+		tsb_reg = 0x7UL;
+		page_sz = 4 * 1024 * 1024;
+		break;
+
+	default:
+		BUG();
+	};
+	tte |= pte_sz_bits(page_sz);
+
+	if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
+		/* Physical mapping, no locked TLB entry for TSB.  */
+		tsb_reg |= tsb_paddr;
+
+		mm->context.tsb_reg_val = tsb_reg;
+		mm->context.tsb_map_vaddr = 0;
+		mm->context.tsb_map_pte = 0;
+	} else {
+		tsb_reg |= base;
+		tsb_reg |= (tsb_paddr & (page_sz - 1UL));
+		tte |= (tsb_paddr & ~(page_sz - 1UL));
+
+		mm->context.tsb_reg_val = tsb_reg;
+		mm->context.tsb_map_vaddr = base;
+		mm->context.tsb_map_pte = tte;
+	}
+
+	/* Setup the Hypervisor TSB descriptor.  */
+	if (tlb_type == hypervisor) {
+		struct hv_tsb_descr *hp = &mm->context.tsb_descr;
+
+		switch (PAGE_SIZE) {
+		case 8192:
+		default:
+			hp->pgsz_idx = HV_PGSZ_IDX_8K;
+			break;
+
+		case 64 * 1024:
+			hp->pgsz_idx = HV_PGSZ_IDX_64K;
+			break;
+
+		case 512 * 1024:
+			hp->pgsz_idx = HV_PGSZ_IDX_512K;
+			break;
+
+		case 4 * 1024 * 1024:
+			hp->pgsz_idx = HV_PGSZ_IDX_4MB;
+			break;
+		};
+		hp->assoc = 1;
+		hp->num_ttes = tsb_bytes / 16;
+		hp->ctx_idx = 0;
+		switch (PAGE_SIZE) {
+		case 8192:
+		default:
+			hp->pgsz_mask = HV_PGSZ_MASK_8K;
+			break;
+
+		case 64 * 1024:
+			hp->pgsz_mask = HV_PGSZ_MASK_64K;
+			break;
+
+		case 512 * 1024:
+			hp->pgsz_mask = HV_PGSZ_MASK_512K;
+			break;
+
+		case 4 * 1024 * 1024:
+			hp->pgsz_mask = HV_PGSZ_MASK_4MB;
+			break;
+		};
+		hp->tsb_base = tsb_paddr;
+		hp->resv = 0;
+	}
+}
+
+static kmem_cache_t *tsb_caches[8] __read_mostly;
+
+static const char *tsb_cache_names[8] = {
+	"tsb_8KB",
+	"tsb_16KB",
+	"tsb_32KB",
+	"tsb_64KB",
+	"tsb_128KB",
+	"tsb_256KB",
+	"tsb_512KB",
+	"tsb_1MB",
+};
+
+void __init tsb_cache_init(void)
+{
+	unsigned long i;
+
+	for (i = 0; i < 8; i++) {
+		unsigned long size = 8192 << i;
+		const char *name = tsb_cache_names[i];
+
+		tsb_caches[i] = kmem_cache_create(name,
+						  size, size,
+						  SLAB_HWCACHE_ALIGN |
+						  SLAB_MUST_HWCACHE_ALIGN,
+						  NULL, NULL);
+		if (!tsb_caches[i]) {
+			prom_printf("Could not create %s cache\n", name);
+			prom_halt();
+		}
+	}
+}
+
+/* When the RSS of an address space exceeds mm->context.tsb_rss_limit,
+ * do_sparc64_fault() invokes this routine to try and grow the TSB.
+ *
+ * When we reach the maximum TSB size supported, we stick ~0UL into
+ * mm->context.tsb_rss_limit so the grow checks in update_mmu_cache()
+ * will not trigger any longer.
+ *
+ * The TSB can be anywhere from 8K to 1MB in size, in increasing powers
+ * of two.  The TSB must be aligned to its size, so e.g. a 512K TSB
+ * must be 512K aligned.  It also must be physically contiguous, so we
+ * cannot use vmalloc().
+ *
+ * The idea here is to grow the TSB when the RSS of the process approaches
+ * the number of entries that the current TSB can hold at once.  Currently,
+ * we trigger when the RSS hits 3/4 of the TSB capacity.
+ */
+void tsb_grow(struct mm_struct *mm, unsigned long rss)
+{
+	unsigned long max_tsb_size = 1 * 1024 * 1024;
+	unsigned long new_size, old_size, flags;
+	struct tsb *old_tsb, *new_tsb;
+	unsigned long new_cache_index, old_cache_index;
+	unsigned long new_rss_limit;
+	gfp_t gfp_flags;
+
+	if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))
+		max_tsb_size = (PAGE_SIZE << MAX_ORDER);
+
+	new_cache_index = 0;
+	for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) {
+		unsigned long n_entries = new_size / sizeof(struct tsb);
+
+		n_entries = (n_entries * 3) / 4;
+		if (n_entries > rss)
+			break;
+
+		new_cache_index++;
+	}
+
+	if (new_size == max_tsb_size)
+		new_rss_limit = ~0UL;
+	else
+		new_rss_limit = ((new_size / sizeof(struct tsb)) * 3) / 4;
+
+retry_tsb_alloc:
+	gfp_flags = GFP_KERNEL;
+	if (new_size > (PAGE_SIZE * 2))
+		gfp_flags = __GFP_NOWARN | __GFP_NORETRY;
+
+	new_tsb = kmem_cache_alloc(tsb_caches[new_cache_index], gfp_flags);
+	if (unlikely(!new_tsb)) {
+		/* Not being able to fork due to a high-order TSB
+		 * allocation failure is very bad behavior.  Just back
+		 * down to a 0-order allocation and force no TSB
+		 * growing for this address space.
+		 */
+		if (mm->context.tsb == NULL && new_cache_index > 0) {
+			new_cache_index = 0;
+			new_size = 8192;
+			new_rss_limit = ~0UL;
+			goto retry_tsb_alloc;
+		}
+
+		/* If we failed on a TSB grow, we are under serious
+		 * memory pressure so don't try to grow any more.
+		 */
+		if (mm->context.tsb != NULL)
+			mm->context.tsb_rss_limit = ~0UL;
+		return;
+	}
+
+	/* Mark all tags as invalid.  */
+	tsb_init(new_tsb, new_size);
+
+	/* Ok, we are about to commit the changes.  If we are
+	 * growing an existing TSB the locking is very tricky,
+	 * so WATCH OUT!
+	 *
+	 * We have to hold mm->context.lock while committing to the
+	 * new TSB, this synchronizes us with processors in
+	 * flush_tsb_user() and switch_mm() for this address space.
+	 *
+	 * But even with that lock held, processors run asynchronously
+	 * accessing the old TSB via TLB miss handling.  This is OK
+	 * because those actions are just propagating state from the
+	 * Linux page tables into the TSB, page table mappings are not
+	 * being changed.  If a real fault occurs, the processor will
+	 * synchronize with us when it hits flush_tsb_user(), this is
+	 * also true for the case where vmscan is modifying the page
+	 * tables.  The only thing we need to be careful with is to
+	 * skip any locked TSB entries during copy_tsb().
+	 *
+	 * When we finish committing to the new TSB, we have to drop
+	 * the lock and ask all other cpus running this address space
+	 * to run tsb_context_switch() to see the new TSB table.
+	 */
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	old_tsb = mm->context.tsb;
+	old_cache_index = (mm->context.tsb_reg_val & 0x7UL);
+	old_size = mm->context.tsb_nentries * sizeof(struct tsb);
+
+
+	/* Handle multiple threads trying to grow the TSB at the same time.
+	 * One will get in here first, and bump the size and the RSS limit.
+	 * The others will get in here next and hit this check.
+	 */
+	if (unlikely(old_tsb && (rss < mm->context.tsb_rss_limit))) {
+		spin_unlock_irqrestore(&mm->context.lock, flags);
+
+		kmem_cache_free(tsb_caches[new_cache_index], new_tsb);
+		return;
+	}
+
+	mm->context.tsb_rss_limit = new_rss_limit;
+
+	if (old_tsb) {
+		extern void copy_tsb(unsigned long old_tsb_base,
+				     unsigned long old_tsb_size,
+				     unsigned long new_tsb_base,
+				     unsigned long new_tsb_size);
+		unsigned long old_tsb_base = (unsigned long) old_tsb;
+		unsigned long new_tsb_base = (unsigned long) new_tsb;
+
+		if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
+			old_tsb_base = __pa(old_tsb_base);
+			new_tsb_base = __pa(new_tsb_base);
+		}
+		copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size);
+	}
+
+	mm->context.tsb = new_tsb;
+	setup_tsb_params(mm, new_size);
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
+
+	/* If old_tsb is NULL, we're being invoked for the first time
+	 * from init_new_context().
+	 */
+	if (old_tsb) {
+		/* Reload it on the local cpu.  */
+		tsb_context_switch(mm);
+
+		/* Now force other processors to do the same.  */
+		smp_tsb_sync(mm);
+
+		/* Now it is safe to free the old tsb.  */
+		kmem_cache_free(tsb_caches[old_cache_index], old_tsb);
+	}
+}
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	spin_lock_init(&mm->context.lock);
+
+	mm->context.sparc64_ctx_val = 0UL;
+
+	/* copy_mm() copies over the parent's mm_struct before calling
+	 * us, so we need to zero out the TSB pointer or else tsb_grow()
+	 * will be confused and think there is an older TSB to free up.
+	 */
+	mm->context.tsb = NULL;
+
+	/* If this is fork, inherit the parent's TSB size.  We would
+	 * grow it to that size on the first page fault anyway.
+	 */
+	tsb_grow(mm, get_mm_rss(mm));
+
+	if (unlikely(!mm->context.tsb))
+		return -ENOMEM;
+
+	return 0;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+	unsigned long flags, cache_index;
+
+	cache_index = (mm->context.tsb_reg_val & 0x7UL);
+	kmem_cache_free(tsb_caches[cache_index], mm->context.tsb);
+
+	/* We can remove these later, but for now it's useful
+	 * to catch any bogus post-destroy_context() references
+	 * to the TSB.
+	 */
+	mm->context.tsb = NULL;
+	mm->context.tsb_reg_val = 0UL;
+
+	spin_lock_irqsave(&ctx_alloc_lock, flags);
+
+	if (CTX_VALID(mm->context)) {
+		unsigned long nr = CTX_NRBITS(mm->context);
+		mmu_context_bmap[nr>>6] &= ~(1UL << (nr & 63));
+	}
+
+	spin_unlock_irqrestore(&ctx_alloc_lock, flags);
+}
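A note on the sizing loop in tsb_grow() above: it implements a simple policy — keep the TSB at no more than three-quarters occupancy, doubling from the 8K base until the 1MB cap (or the MAX_ORDER page-allocation limit) is reached. A minimal stand-alone sketch of that policy in C; TSB_ENTRY_SIZE is a hypothetical stand-in for sizeof(struct tsb) (tag plus PTE):

	#include <stdio.h>

	/* Hypothetical stand-ins for the kernel's struct tsb and limits. */
	#define TSB_ENTRY_SIZE	16UL		/* assumed: tag word + PTE word */
	#define TSB_MIN_BYTES	8192UL
	#define TSB_MAX_BYTES	(1UL * 1024 * 1024)

	/* Pick the smallest TSB whose 3/4 load factor still covers `rss`,
	 * mirroring the doubling loop in tsb_grow() above. */
	static unsigned long tsb_size_for_rss(unsigned long rss)
	{
		unsigned long size;

		for (size = TSB_MIN_BYTES; size < TSB_MAX_BYTES; size <<= 1UL) {
			unsigned long entries = (size / TSB_ENTRY_SIZE) * 3 / 4;
			if (entries > rss)
				break;
		}
		return size;
	}

	int main(void)
	{
		unsigned long rss;

		for (rss = 100; rss <= 100000; rss *= 10)
			printf("rss %6lu -> TSB %7lu bytes\n",
			       rss, tsb_size_for_rss(rss));
		return 0;
	}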

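The rss re-check against tsb_rss_limit under mm->context.lock is a classic double-checked pattern: allocate the larger table speculatively outside the lock, then re-test the trigger condition under the lock and discard the allocation if another thread won the race. A hedged pthreads sketch of the same shape (struct table and its fields are illustrative, not kernel types):

	#include <pthread.h>
	#include <stdlib.h>

	/* Illustrative only: grow_limit plays the role of
	 * mm->context.tsb_rss_limit in tsb_grow() above. */
	struct table {
		pthread_mutex_t	lock;
		void		*mem;
		size_t		size;
		size_t		grow_limit;
	};

	static void table_grow(struct table *t, size_t used, size_t new_size)
	{
		/* Speculative allocation, outside the lock. */
		void *new_mem = malloc(new_size);

		if (!new_mem)
			return;

		pthread_mutex_lock(&t->lock);
		if (used < t->grow_limit) {
			/* Another thread grew the table first; back out. */
			pthread_mutex_unlock(&t->lock);
			free(new_mem);
			return;
		}
		/* Commit.  The kernel version copies the old entries here and
		 * defers freeing the old table until every CPU has switched to
		 * the new one; this sketch frees it immediately for brevity. */
		free(t->mem);
		t->mem = new_mem;
		t->size = new_size;
		t->grow_limit = new_size * 3 / 4;
		pthread_mutex_unlock(&t->lock);
	}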
+ 293 - 81
arch/sparc64/mm/ultra.S

@@ -15,6 +15,7 @@
 #include <asm/head.h>
 #include <asm/thread_info.h>
 #include <asm/cacheflush.h>
+#include <asm/hypervisor.h>
 
 	/* Basically, most of the Spitfire vs. Cheetah madness
 	 * has to do with the fact that Cheetah does not support
@@ -29,16 +30,18 @@
 	.text
 	.align		32
 	.globl		__flush_tlb_mm
-__flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
+__flush_tlb_mm:		/* 18 insns */
+	/* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
 	ldxa		[%o1] ASI_DMMU, %g2
 	cmp		%g2, %o0
 	bne,pn		%icc, __spitfire_flush_tlb_mm_slow
 	 mov		0x50, %g3
 	stxa		%g0, [%g3] ASI_DMMU_DEMAP
 	stxa		%g0, [%g3] ASI_IMMU_DEMAP
+	sethi		%hi(KERNBASE), %g3
+	flush		%g3
 	retl
-	 flush		%g6
-	nop
+	 nop
 	nop
 	nop
 	nop
@@ -51,7 +54,7 @@ __flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
 
 	.align		32
 	.globl		__flush_tlb_pending
-__flush_tlb_pending:
+__flush_tlb_pending:	/* 26 insns */
 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
 	rdpr		%pstate, %g7
 	sllx		%o1, 3, %o1
@@ -72,7 +75,8 @@ __flush_tlb_pending:
 	brnz,pt		%o1, 1b
 	 nop
 	stxa		%g2, [%o4] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o4
+	flush		%o4
 	retl
 	 wrpr		%g7, 0x0, %pstate
 	nop
@@ -82,7 +86,8 @@ __flush_tlb_pending:
 
 	.align		32
 	.globl		__flush_tlb_kernel_range
-__flush_tlb_kernel_range:	/* %o0=start, %o1=end */
+__flush_tlb_kernel_range:	/* 16 insns */
+	/* %o0=start, %o1=end */
 	cmp		%o0, %o1
 	be,pn		%xcc, 2f
 	 sethi		%hi(PAGE_SIZE), %o4
@@ -94,8 +99,11 @@ __flush_tlb_kernel_range:	/* %o0=start, %o1=end */
 	membar		#Sync
 	brnz,pt		%o3, 1b
 	 sub		%o3, %o4, %o3
-2:	retl
-	 flush		%g6
+2:	sethi		%hi(KERNBASE), %o3
+	flush		%o3
+	retl
+	 nop
+	nop
 
 __spitfire_flush_tlb_mm_slow:
 	rdpr		%pstate, %g1
@@ -105,7 +113,8 @@ __spitfire_flush_tlb_mm_slow:
 	stxa		%g0, [%g3] ASI_IMMU_DEMAP
 	flush		%g6
 	stxa		%g2, [%o1] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o1
+	flush		%o1
 	retl
 	 wrpr		%g1, 0, %pstate
 
@@ -181,7 +190,7 @@ __flush_dcache_page:	/* %o0=kaddr, %o1=flush_icache */
 	.previous
 
 	/* Cheetah specific versions, patched at boot time. */
-__cheetah_flush_tlb_mm: /* 18 insns */
+__cheetah_flush_tlb_mm: /* 19 insns */
 	rdpr		%pstate, %g7
 	andn		%g7, PSTATE_IE, %g2
 	wrpr		%g2, 0x0, %pstate
@@ -196,12 +205,13 @@ __cheetah_flush_tlb_mm: /* 18 insns */
 	stxa		%g0, [%g3] ASI_DMMU_DEMAP
 	stxa		%g0, [%g3] ASI_IMMU_DEMAP
 	stxa		%g2, [%o2] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o2
+	flush		%o2
 	wrpr		%g0, 0, %tl
 	retl
 	 wrpr		%g7, 0x0, %pstate
 
-__cheetah_flush_tlb_pending:	/* 26 insns */
+__cheetah_flush_tlb_pending:	/* 27 insns */
 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
 	rdpr		%pstate, %g7
 	sllx		%o1, 3, %o1
@@ -225,7 +235,8 @@ __cheetah_flush_tlb_pending:	/* 26 insns */
 	brnz,pt		%o1, 1b
 	 nop
 	stxa		%g2, [%o4] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o4
+	flush		%o4
 	wrpr		%g0, 0, %tl
 	retl
 	 wrpr		%g7, 0x0, %pstate
@@ -245,7 +256,76 @@ __cheetah_flush_dcache_page: /* 11 insns */
 	 nop
 #endif /* DCACHE_ALIASING_POSSIBLE */
 
-cheetah_patch_one:
+	/* Hypervisor specific versions, patched at boot time.  */
+__hypervisor_tlb_tl0_error:
+	save		%sp, -192, %sp
+	mov		%i0, %o0
+	call		hypervisor_tlbop_error
+	 mov		%i1, %o1
+	ret
+	 restore
+
+__hypervisor_flush_tlb_mm: /* 10 insns */
+	mov		%o0, %o2	/* ARG2: mmu context */
+	mov		0, %o0		/* ARG0: CPU lists unimplemented */
+	mov		0, %o1		/* ARG1: CPU lists unimplemented */
+	mov		HV_MMU_ALL, %o3	/* ARG3: flags */
+	mov		HV_FAST_MMU_DEMAP_CTX, %o5
+	ta		HV_FAST_TRAP
+	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	 mov		HV_FAST_MMU_DEMAP_CTX, %o1
+	retl
+	 nop
+
+__hypervisor_flush_tlb_pending: /* 16 insns */
+	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+	sllx		%o1, 3, %g1
+	mov		%o2, %g2
+	mov		%o0, %g3
+1:	sub		%g1, (1 << 3), %g1
+	ldx		[%g2 + %g1], %o0      /* ARG0: vaddr + IMMU-bit */
+	mov		%g3, %o1	      /* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2	      /* ARG2: flags */
+	srlx		%o0, PAGE_SHIFT, %o0
+	sllx		%o0, PAGE_SHIFT, %o0
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	 mov		HV_MMU_UNMAP_ADDR_TRAP, %o1
+	brnz,pt		%g1, 1b
+	 nop
+	retl
+	 nop
+
+__hypervisor_flush_tlb_kernel_range: /* 16 insns */
+	/* %o0=start, %o1=end */
+	cmp		%o0, %o1
+	be,pn		%xcc, 2f
+	 sethi		%hi(PAGE_SIZE), %g3
+	mov		%o0, %g1
+	sub		%o1, %g1, %g2
+	sub		%g2, %g3, %g2
+1:	add		%g1, %g2, %o0	/* ARG0: virtual address */
+	mov		0, %o1		/* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2	/* ARG2: flags */
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	 mov		HV_MMU_UNMAP_ADDR_TRAP, %o1
+	brnz,pt		%g2, 1b
+	 sub		%g2, %g3, %g2
+2:	retl
+	 nop
+
+#ifdef DCACHE_ALIASING_POSSIBLE
+	/* XXX Niagara and friends have an 8K cache, so no aliasing is
+	 * XXX possible, but nothing explicit in the Hypervisor API
+	 * XXX guarantees this.
+	 */
+__hypervisor_flush_dcache_page:	/* 2 insns */
+	retl
+	 nop
+#endif
+
+tlb_patch_one:
 1:	lduw		[%o1], %g1
 	stw		%g1, [%o0]
 	flush		%o0
@@ -264,22 +344,22 @@ cheetah_patch_cachetlbops:
 	or		%o0, %lo(__flush_tlb_mm), %o0
 	sethi		%hi(__cheetah_flush_tlb_mm), %o1
 	or		%o1, %lo(__cheetah_flush_tlb_mm), %o1
-	call		cheetah_patch_one
-	 mov		18, %o2
+	call		tlb_patch_one
+	 mov		19, %o2
 
 	sethi		%hi(__flush_tlb_pending), %o0
 	or		%o0, %lo(__flush_tlb_pending), %o0
 	sethi		%hi(__cheetah_flush_tlb_pending), %o1
 	or		%o1, %lo(__cheetah_flush_tlb_pending), %o1
-	call		cheetah_patch_one
-	 mov		26, %o2
+	call		tlb_patch_one
+	 mov		27, %o2
 
 #ifdef DCACHE_ALIASING_POSSIBLE
 	sethi		%hi(__flush_dcache_page), %o0
 	or		%o0, %lo(__flush_dcache_page), %o0
 	sethi		%hi(__cheetah_flush_dcache_page), %o1
 	or		%o1, %lo(__cheetah_flush_dcache_page), %o1
-	call		cheetah_patch_one
+	call		tlb_patch_one
 	 mov		11, %o2
#endif /* DCACHE_ALIASING_POSSIBLE */
 
@@ -295,16 +375,14 @@ cheetah_patch_cachetlbops:
 	 *   %g1	address arg 1	(tlb page and range flushes)
 	 *   %g7	address arg 2	(tlb range flush only)
 	 *
-	 *   %g6	ivector table, don't touch
-	 *   %g2	scratch 1
-	 *   %g3	scratch 2
-	 *   %g4	scratch 3
-	 *
-	 * TODO: Make xcall TLB range flushes use the tricks above... -DaveM
+	 *   %g6	scratch 1
+	 *   %g2	scratch 2
+	 *   %g3	scratch 3
+	 *   %g4	scratch 4
 	 */
 	.align		32
 	.globl		xcall_flush_tlb_mm
-xcall_flush_tlb_mm:
+xcall_flush_tlb_mm:	/* 21 insns */
 	mov		PRIMARY_CONTEXT, %g2
 	ldxa		[%g2] ASI_DMMU, %g3
 	srlx		%g3, CTX_PGSZ1_NUC_SHIFT, %g4
@@ -316,9 +394,19 @@ xcall_flush_tlb_mm:
 	stxa		%g0, [%g4] ASI_IMMU_DEMAP
 	stxa		%g3, [%g2] ASI_DMMU
 	retry
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
 
 	.globl		xcall_flush_tlb_pending
-xcall_flush_tlb_pending:
+xcall_flush_tlb_pending:	/* 21 insns */
 	/* %g5=context, %g1=nr, %g7=vaddrs[] */
 	sllx		%g1, 3, %g1
 	mov		PRIMARY_CONTEXT, %g4
@@ -341,9 +429,10 @@ xcall_flush_tlb_pending:
 	 nop
 	stxa		%g2, [%g4] ASI_DMMU
 	retry
+	nop
 
 	.globl		xcall_flush_tlb_kernel_range
-xcall_flush_tlb_kernel_range:
+xcall_flush_tlb_kernel_range:	/* 25 insns */
 	sethi		%hi(PAGE_SIZE - 1), %g2
 	or		%g2, %lo(PAGE_SIZE - 1), %g2
 	andn		%g1, %g2, %g1
@@ -360,14 +449,30 @@ xcall_flush_tlb_kernel_range:
 	retry
 	nop
 	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
 
 	/* This runs in a very controlled environment, so we do
 	 * not need to worry about BH races etc.
 	 */
 	.globl		xcall_sync_tick
 xcall_sync_tick:
-	rdpr		%pstate, %g2
+
+661:	rdpr		%pstate, %g2
 	wrpr		%g2, PSTATE_IG | PSTATE_AG, %pstate
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
 	rdpr		%pil, %g2
 	wrpr		%g0, 15, %pil
 	sethi		%hi(109f), %g7
@@ -390,8 +495,15 @@ xcall_sync_tick:
 	 */
 	.globl		xcall_report_regs
 xcall_report_regs:
-	rdpr		%pstate, %g2
+
+661:	rdpr		%pstate, %g2
 	wrpr		%g2, PSTATE_IG | PSTATE_AG, %pstate
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
 	rdpr		%pil, %g2
 	wrpr		%g0, 15, %pil
 	sethi		%hi(109f), %g7
@@ -453,62 +565,96 @@ xcall_flush_dcache_page_spitfire: /* %g1 == physical page address
 	nop
 	nop
 
-	.data
-
-errata32_hwbug:
-	.xword	0
-
-	.text
-
-	/* These two are not performance critical... */
-	.globl		xcall_flush_tlb_all_spitfire
-xcall_flush_tlb_all_spitfire:
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-	clr		%g2
-	clr		%g3
-1:	ldxa		[%g3] ASI_DTLB_DATA_ACCESS, %g4
-	and		%g4, _PAGE_L, %g5
-	brnz,pn		%g5, 2f
-	 mov		TLB_TAG_ACCESS, %g7
-
-	stxa		%g0, [%g7] ASI_DMMU
-	membar		#Sync
-	stxa		%g0, [%g3] ASI_DTLB_DATA_ACCESS
+	/* %g5:	error
+	 * %g6:	tlb op
+	 */
+__hypervisor_tlb_xcall_error:
+	mov	%g5, %g4
+	mov	%g6, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o0
+	call	hypervisor_tlbop_error_xcall
+	 mov	%l5, %o1
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	.globl		__hypervisor_xcall_flush_tlb_mm
+__hypervisor_xcall_flush_tlb_mm: /* 21 insns */
+	/* %g5=ctx, g1,g2,g3,g4,g7=scratch, %g6=unusable */
+	mov		%o0, %g2
+	mov		%o1, %g3
+	mov		%o2, %g4
+	mov		%o3, %g1
+	mov		%o5, %g7
+	clr		%o0		/* ARG0: CPU lists unimplemented */
+	clr		%o1		/* ARG1: CPU lists unimplemented */
+	mov		%g5, %o2	/* ARG2: mmu context */
+	mov		HV_MMU_ALL, %o3	/* ARG3: flags */
+	mov		HV_FAST_MMU_DEMAP_CTX, %o5
+	ta		HV_FAST_TRAP
+	mov		HV_FAST_MMU_DEMAP_CTX, %g6
+	brnz,pn		%o0, __hypervisor_tlb_xcall_error
+	 mov		%o0, %g5
+	mov		%g2, %o0
+	mov		%g3, %o1
+	mov		%g4, %o2
+	mov		%g1, %o3
+	mov		%g7, %o5
 	membar		#Sync
+	retry
 
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-2:	ldxa		[%g3] ASI_ITLB_DATA_ACCESS, %g4
-	and		%g4, _PAGE_L, %g5
-	brnz,pn		%g5, 2f
-	 mov		TLB_TAG_ACCESS, %g7
-
-	stxa		%g0, [%g7] ASI_IMMU
-	membar		#Sync
-	stxa		%g0, [%g3] ASI_ITLB_DATA_ACCESS
+	.globl		__hypervisor_xcall_flush_tlb_pending
+__hypervisor_xcall_flush_tlb_pending: /* 21 insns */
+	/* %g5=ctx, %g1=nr, %g7=vaddrs[], %g2,%g3,%g4,g6=scratch */
+	sllx		%g1, 3, %g1
+	mov		%o0, %g2
+	mov		%o1, %g3
+	mov		%o2, %g4
+1:	sub		%g1, (1 << 3), %g1
+	ldx		[%g7 + %g1], %o0	/* ARG0: virtual address */
+	mov		%g5, %o1		/* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2		/* ARG2: flags */
+	srlx		%o0, PAGE_SHIFT, %o0
+	sllx		%o0, PAGE_SHIFT, %o0
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	mov		HV_MMU_UNMAP_ADDR_TRAP, %g6
+	brnz,a,pn	%o0, __hypervisor_tlb_xcall_error
+	 mov		%o0, %g5
+	brnz,pt		%g1, 1b
+	 nop
+	mov		%g2, %o0
+	mov		%g3, %o1
+	mov		%g4, %o2
 	membar		#Sync
-
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-2:	add		%g2, 1, %g2
-	cmp		%g2, SPITFIRE_HIGHEST_LOCKED_TLBENT
-	ble,pt		%icc, 1b
-	 sll		%g2, 3, %g3
-	flush		%g6
 	retry
 
-	.globl		xcall_flush_tlb_all_cheetah
-xcall_flush_tlb_all_cheetah:
-	mov		0x80, %g2
-	stxa		%g0, [%g2] ASI_DMMU_DEMAP
-	stxa		%g0, [%g2] ASI_IMMU_DEMAP
+	.globl		__hypervisor_xcall_flush_tlb_kernel_range
+__hypervisor_xcall_flush_tlb_kernel_range: /* 25 insns */
+	/* %g1=start, %g7=end, g2,g3,g4,g5,g6=scratch */
+	sethi		%hi(PAGE_SIZE - 1), %g2
+	or		%g2, %lo(PAGE_SIZE - 1), %g2
+	andn		%g1, %g2, %g1
+	andn		%g7, %g2, %g7
+	sub		%g7, %g1, %g3
+	add		%g2, 1, %g2
+	sub		%g3, %g2, %g3
+	mov		%o0, %g2
+	mov		%o1, %g4
+	mov		%o2, %g7
+1:	add		%g1, %g3, %o0	/* ARG0: virtual address */
+	mov		0, %o1		/* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2	/* ARG2: flags */
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	mov		HV_MMU_UNMAP_ADDR_TRAP, %g6
+	brnz,pn		%o0, __hypervisor_tlb_xcall_error
+	 mov		%o0, %g5
+	sethi		%hi(PAGE_SIZE), %o2
+	brnz,pt		%g3, 1b
+	 sub		%g3, %o2, %g3
+	mov		%g2, %o0
+	mov		%g4, %o1
+	mov		%g7, %o2
+	membar		#Sync
 	retry
 
 	/* These just get rescheduled to PIL vectors. */
@@ -527,4 +673,70 @@ xcall_capture:
 	wr		%g0, (1 << PIL_SMP_CAPTURE), %set_softint
 	retry
 
+	.globl		xcall_new_mmu_context_version
+xcall_new_mmu_context_version:
+	wr		%g0, (1 << PIL_SMP_CTX_NEW_VERSION), %set_softint
+	retry
+
 #endif /* CONFIG_SMP */
+
+
+	.globl		hypervisor_patch_cachetlbops
+hypervisor_patch_cachetlbops:
+	save		%sp, -128, %sp
+
+	sethi		%hi(__flush_tlb_mm), %o0
+	or		%o0, %lo(__flush_tlb_mm), %o0
+	sethi		%hi(__hypervisor_flush_tlb_mm), %o1
+	or		%o1, %lo(__hypervisor_flush_tlb_mm), %o1
+	call		tlb_patch_one
+	 mov		10, %o2
+
+	sethi		%hi(__flush_tlb_pending), %o0
+	or		%o0, %lo(__flush_tlb_pending), %o0
+	sethi		%hi(__hypervisor_flush_tlb_pending), %o1
+	or		%o1, %lo(__hypervisor_flush_tlb_pending), %o1
+	call		tlb_patch_one
+	 mov		16, %o2
+
+	sethi		%hi(__flush_tlb_kernel_range), %o0
+	or		%o0, %lo(__flush_tlb_kernel_range), %o0
+	sethi		%hi(__hypervisor_flush_tlb_kernel_range), %o1
+	or		%o1, %lo(__hypervisor_flush_tlb_kernel_range), %o1
+	call		tlb_patch_one
+	 mov		16, %o2
+
+#ifdef DCACHE_ALIASING_POSSIBLE
+	sethi		%hi(__flush_dcache_page), %o0
+	or		%o0, %lo(__flush_dcache_page), %o0
+	sethi		%hi(__hypervisor_flush_dcache_page), %o1
+	or		%o1, %lo(__hypervisor_flush_dcache_page), %o1
+	call		tlb_patch_one
+	 mov		2, %o2
+#endif /* DCACHE_ALIASING_POSSIBLE */
+
+#ifdef CONFIG_SMP
+	sethi		%hi(xcall_flush_tlb_mm), %o0
+	or		%o0, %lo(xcall_flush_tlb_mm), %o0
+	sethi		%hi(__hypervisor_xcall_flush_tlb_mm), %o1
+	or		%o1, %lo(__hypervisor_xcall_flush_tlb_mm), %o1
+	call		tlb_patch_one
+	 mov		21, %o2
+
+	sethi		%hi(xcall_flush_tlb_pending), %o0
+	or		%o0, %lo(xcall_flush_tlb_pending), %o0
+	sethi		%hi(__hypervisor_xcall_flush_tlb_pending), %o1
+	or		%o1, %lo(__hypervisor_xcall_flush_tlb_pending), %o1
+	call		tlb_patch_one
+	 mov		21, %o2
+
+	sethi		%hi(xcall_flush_tlb_kernel_range), %o0
+	or		%o0, %lo(xcall_flush_tlb_kernel_range), %o0
+	sethi		%hi(__hypervisor_xcall_flush_tlb_kernel_range), %o1
+	or		%o1, %lo(__hypervisor_xcall_flush_tlb_kernel_range), %o1
+	call		tlb_patch_one
+	 mov		25, %o2
+#endif /* CONFIG_SMP */
+
+	ret
+	 restore

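A note on the mechanism above: tlb_patch_one is a boot-time code patcher. Each caller passes the generic entry point, the chip- or hypervisor-specific replacement, and an instruction count (hence the /* NN insns */ annotations, which have to match what the patch sites pass in %o2); the routine copies that many 32-bit words and flushes each one out to the instruction stream. Roughly, in hedged C form — flush_icache_word() stands in for the SPARC flush instruction and is not a real kernel helper:

	#include <stdint.h>

	/* Hypothetical stand-in for the SPARC `flush` instruction, which
	 * synchronizes the I-cache with a just-stored instruction word. */
	extern void flush_icache_word(void *addr);

	/* Copy `ninsns` 32-bit instruction words from `src` over `dst`,
	 * mirroring the lduw/stw/flush loop in tlb_patch_one above. */
	static void patch_insns(uint32_t *dst, const uint32_t *src,
				unsigned long ninsns)
	{
		while (ninsns--) {
			*dst = *src;
			flush_icache_word(dst);
			dst++;
			src++;
		}
	}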
+ 17 - 194
arch/sparc64/prom/cif.S

@@ -1,10 +1,12 @@
 /* cif.S: PROM entry/exit assembler trampolines.
  *
- * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
+ * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 2005, 2006 David S. Miller <davem@davemloft.net>
  */
 
 #include <asm/pstate.h>
+#include <asm/cpudata.h>
+#include <asm/thread_info.h>
 
 	.text
 	.globl	prom_cif_interface
@@ -12,78 +14,16 @@ prom_cif_interface:
 	sethi	%hi(p1275buf), %o0
 	or	%o0, %lo(p1275buf), %o0
 	ldx	[%o0 + 0x010], %o1	! prom_cif_stack
-	save	%o1, -0x190, %sp
+	save	%o1, -192, %sp
 	ldx	[%i0 + 0x008], %l2	! prom_cif_handler
-	rdpr	%pstate, %l4
-	wrpr	%g0, 0x15, %pstate	! save alternate globals
-	stx	%g1, [%sp + 2047 + 0x0b0]
-	stx	%g2, [%sp + 2047 + 0x0b8]
-	stx	%g3, [%sp + 2047 + 0x0c0]
-	stx	%g4, [%sp + 2047 + 0x0c8]
-	stx	%g5, [%sp + 2047 + 0x0d0]
-	stx	%g6, [%sp + 2047 + 0x0d8]
-	stx	%g7, [%sp + 2047 + 0x0e0]
-	wrpr	%g0, 0x814, %pstate	! save interrupt globals
-	stx	%g1, [%sp + 2047 + 0x0e8]
-	stx	%g2, [%sp + 2047 + 0x0f0]
-	stx	%g3, [%sp + 2047 + 0x0f8]
-	stx	%g4, [%sp + 2047 + 0x100]
-	stx	%g5, [%sp + 2047 + 0x108]
-	stx	%g6, [%sp + 2047 + 0x110]
-	stx	%g7, [%sp + 2047 + 0x118]
-	wrpr	%g0, 0x14, %pstate	! save normal globals
-	stx	%g1, [%sp + 2047 + 0x120]
-	stx	%g2, [%sp + 2047 + 0x128]
-	stx	%g3, [%sp + 2047 + 0x130]
-	stx	%g4, [%sp + 2047 + 0x138]
-	stx	%g5, [%sp + 2047 + 0x140]
-	stx	%g6, [%sp + 2047 + 0x148]
-	stx	%g7, [%sp + 2047 + 0x150]
-	wrpr	%g0, 0x414, %pstate	! save mmu globals
-	stx	%g1, [%sp + 2047 + 0x158]
-	stx	%g2, [%sp + 2047 + 0x160]
-	stx	%g3, [%sp + 2047 + 0x168]
-	stx	%g4, [%sp + 2047 + 0x170]
-	stx	%g5, [%sp + 2047 + 0x178]
-	stx	%g6, [%sp + 2047 + 0x180]
-	stx	%g7, [%sp + 2047 + 0x188]
-	mov	%g1, %l0		! also save to locals, so we can handle
-	mov	%g2, %l1		! tlb faults later on, when accessing
-	mov	%g3, %l3		! the stack.
-	mov	%g7, %l5
-	wrpr	%l4, PSTATE_IE, %pstate	! turn off interrupts
+	mov	%g4, %l0
+	mov	%g5, %l1
+	mov	%g6, %l3
 	call	%l2
 	 add	%i0, 0x018, %o0		! prom_args
-	wrpr	%g0, 0x414, %pstate	! restore mmu globals
-	mov	%l0, %g1
-	mov	%l1, %g2
-	mov	%l3, %g3
-	mov	%l5, %g7
-	wrpr	%g0, 0x14, %pstate	! restore normal globals
-	ldx	[%sp + 2047 + 0x120], %g1
-	ldx	[%sp + 2047 + 0x128], %g2
-	ldx	[%sp + 2047 + 0x130], %g3
-	ldx	[%sp + 2047 + 0x138], %g4
-	ldx	[%sp + 2047 + 0x140], %g5
-	ldx	[%sp + 2047 + 0x148], %g6
-	ldx	[%sp + 2047 + 0x150], %g7
-	wrpr	%g0, 0x814, %pstate	! restore interrupt globals
-	ldx	[%sp + 2047 + 0x0e8], %g1
-	ldx	[%sp + 2047 + 0x0f0], %g2
-	ldx	[%sp + 2047 + 0x0f8], %g3
-	ldx	[%sp + 2047 + 0x100], %g4
-	ldx	[%sp + 2047 + 0x108], %g5
-	ldx	[%sp + 2047 + 0x110], %g6
-	ldx	[%sp + 2047 + 0x118], %g7
-	wrpr	%g0, 0x15, %pstate	! restore alternate globals
-	ldx	[%sp + 2047 + 0x0b0], %g1
-	ldx	[%sp + 2047 + 0x0b8], %g2
-	ldx	[%sp + 2047 + 0x0c0], %g3
-	ldx	[%sp + 2047 + 0x0c8], %g4
-	ldx	[%sp + 2047 + 0x0d0], %g5
-	ldx	[%sp + 2047 + 0x0d8], %g6
-	ldx	[%sp + 2047 + 0x0e0], %g7
-	wrpr	%l4, 0, %pstate	! restore original pstate
+	mov	%l0, %g4
+	mov	%l1, %g5
+	mov	%l3, %g6
 	ret
 	 restore
 
@@ -91,135 +31,18 @@ prom_cif_interface:
 prom_cif_callback:
 	sethi	%hi(p1275buf), %o1
 	or	%o1, %lo(p1275buf), %o1
-	save	%sp, -0x270, %sp
-	rdpr	%pstate, %l4
-	wrpr	%g0, 0x15, %pstate	! save PROM alternate globals
-	stx	%g1, [%sp + 2047 + 0x0b0]
-	stx	%g2, [%sp + 2047 + 0x0b8]
-	stx	%g3, [%sp + 2047 + 0x0c0]
-	stx	%g4, [%sp + 2047 + 0x0c8]
-	stx	%g5, [%sp + 2047 + 0x0d0]
-	stx	%g6, [%sp + 2047 + 0x0d8]
-	stx	%g7, [%sp + 2047 + 0x0e0]
-					! restore Linux alternate globals
-	ldx	[%sp + 2047 + 0x190], %g1
-	ldx	[%sp + 2047 + 0x198], %g2
-	ldx	[%sp + 2047 + 0x1a0], %g3
-	ldx	[%sp + 2047 + 0x1a8], %g4
-	ldx	[%sp + 2047 + 0x1b0], %g5
-	ldx	[%sp + 2047 + 0x1b8], %g6
-	ldx	[%sp + 2047 + 0x1c0], %g7
-	wrpr	%g0, 0x814, %pstate	! save PROM interrupt globals
-	stx	%g1, [%sp + 2047 + 0x0e8]
-	stx	%g2, [%sp + 2047 + 0x0f0]
-	stx	%g3, [%sp + 2047 + 0x0f8]
-	stx	%g4, [%sp + 2047 + 0x100]
-	stx	%g5, [%sp + 2047 + 0x108]
-	stx	%g6, [%sp + 2047 + 0x110]
-	stx	%g7, [%sp + 2047 + 0x118]
-					! restore Linux interrupt globals
-	ldx	[%sp + 2047 + 0x1c8], %g1
-	ldx	[%sp + 2047 + 0x1d0], %g2
-	ldx	[%sp + 2047 + 0x1d8], %g3
-	ldx	[%sp + 2047 + 0x1e0], %g4
-	ldx	[%sp + 2047 + 0x1e8], %g5
-	ldx	[%sp + 2047 + 0x1f0], %g6
-	ldx	[%sp + 2047 + 0x1f8], %g7
-	wrpr	%g0, 0x14, %pstate	! save PROM normal globals
-	stx	%g1, [%sp + 2047 + 0x120]
-	stx	%g2, [%sp + 2047 + 0x128]
-	stx	%g3, [%sp + 2047 + 0x130]
-	stx	%g4, [%sp + 2047 + 0x138]
-	stx	%g5, [%sp + 2047 + 0x140]
-	stx	%g6, [%sp + 2047 + 0x148]
-	stx	%g7, [%sp + 2047 + 0x150]
-					! restore Linux normal globals
-	ldx	[%sp + 2047 + 0x200], %g1
-	ldx	[%sp + 2047 + 0x208], %g2
-	ldx	[%sp + 2047 + 0x210], %g3
-	ldx	[%sp + 2047 + 0x218], %g4
-	ldx	[%sp + 2047 + 0x220], %g5
-	ldx	[%sp + 2047 + 0x228], %g6
-	ldx	[%sp + 2047 + 0x230], %g7
-	wrpr	%g0, 0x414, %pstate	! save PROM mmu globals
-	stx	%g1, [%sp + 2047 + 0x158]
-	stx	%g2, [%sp + 2047 + 0x160]
-	stx	%g3, [%sp + 2047 + 0x168]
-	stx	%g4, [%sp + 2047 + 0x170]
-	stx	%g5, [%sp + 2047 + 0x178]
-	stx	%g6, [%sp + 2047 + 0x180]
-	stx	%g7, [%sp + 2047 + 0x188]
-					! restore Linux mmu globals
-	ldx	[%sp + 2047 + 0x238], %o0
-	ldx	[%sp + 2047 + 0x240], %o1
-	ldx	[%sp + 2047 + 0x248], %l2
-	ldx	[%sp + 2047 + 0x250], %l3
-	ldx	[%sp + 2047 + 0x258], %l5
-	ldx	[%sp + 2047 + 0x260], %l6
-	ldx	[%sp + 2047 + 0x268], %l7
-					! switch to Linux tba
-	sethi	%hi(sparc64_ttable_tl0), %l1
-	rdpr	%tba, %l0		! save PROM tba
-	mov	%o0, %g1
-	mov	%o1, %g2
-	mov	%l2, %g3
-	mov	%l3, %g4
-	mov	%l5, %g5
-	mov	%l6, %g6
-	mov	%l7, %g7
-	wrpr	%l1, %tba		! install Linux tba
-	wrpr	%l4, 0, %pstate		! restore PSTATE
+	save	%sp, -192, %sp
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	LOAD_PER_CPU_BASE(%g5, %g6, %g4, %g3, %o0)
+	ldx	[%g6 + TI_TASK], %g4
 	call	prom_world
-	 mov	%g0, %o0
+	 mov	0, %o0
 	ldx	[%i1 + 0x000], %l2
 	call	%l2
 	 mov	%i0, %o0
 	mov	%o0, %l1
 	call	prom_world
-	 or	%g0, 1, %o0
-	wrpr	%g0, 0x14, %pstate	! interrupts off
-					! restore PROM mmu globals
-	ldx	[%sp + 2047 + 0x158], %o0
-	ldx	[%sp + 2047 + 0x160], %o1
-	ldx	[%sp + 2047 + 0x168], %l2
-	ldx	[%sp + 2047 + 0x170], %l3
-	ldx	[%sp + 2047 + 0x178], %l5
-	ldx	[%sp + 2047 + 0x180], %l6
-	ldx	[%sp + 2047 + 0x188], %l7
-	wrpr	%g0, 0x414, %pstate	! restore PROM mmu globals
-	mov	%o0, %g1
-	mov	%o1, %g2
-	mov	%l2, %g3
-	mov	%l3, %g4
-	mov	%l5, %g5
-	mov	%l6, %g6
-	mov	%l7, %g7
-	wrpr	%l0, %tba		! restore PROM tba
-	wrpr	%g0, 0x14, %pstate	! restore PROM normal globals
-	ldx	[%sp + 2047 + 0x120], %g1
-	ldx	[%sp + 2047 + 0x128], %g2
-	ldx	[%sp + 2047 + 0x130], %g3
-	ldx	[%sp + 2047 + 0x138], %g4
-	ldx	[%sp + 2047 + 0x140], %g5
-	ldx	[%sp + 2047 + 0x148], %g6
-	ldx	[%sp + 2047 + 0x150], %g7
-	wrpr	%g0, 0x814, %pstate	! restore PROM interrupt globals
-	ldx	[%sp + 2047 + 0x0e8], %g1
-	ldx	[%sp + 2047 + 0x0f0], %g2
-	ldx	[%sp + 2047 + 0x0f8], %g3
-	ldx	[%sp + 2047 + 0x100], %g4
-	ldx	[%sp + 2047 + 0x108], %g5
-	ldx	[%sp + 2047 + 0x110], %g6
-	ldx	[%sp + 2047 + 0x118], %g7
-	wrpr	%g0, 0x15, %pstate	! restore PROM alternate globals
-	ldx	[%sp + 2047 + 0x0b0], %g1
-	ldx	[%sp + 2047 + 0x0b8], %g2
-	ldx	[%sp + 2047 + 0x0c0], %g3
-	ldx	[%sp + 2047 + 0x0c8], %g4
-	ldx	[%sp + 2047 + 0x0d0], %g5
-	ldx	[%sp + 2047 + 0x0d8], %g6
-	ldx	[%sp + 2047 + 0x0e0], %g7
-	wrpr	%l4, 0, %pstate
+	 mov	1, %o0
 	ret
 	 restore %l1, 0, %o0
 

+ 6 - 0
arch/sparc64/prom/console.c

@@ -102,6 +102,9 @@ prom_query_input_device(void)
 	if (!strncmp (propb, "rsc", 3))
 		return PROMDEV_IRSC;
 
+	if (!strncmp (propb, "virtual-console", 3))
+		return PROMDEV_IVCONS;
+
 	if (strncmp (propb, "tty", 3) || !propb[3])
 		return PROMDEV_I_UNK;
 
@@ -143,6 +146,9 @@ prom_query_output_device(void)
 	if (!strncmp (propb, "rsc", 3))
 		return PROMDEV_ORSC;
 
+	if (!strncmp (propb, "virtual-console", 3))
+		return PROMDEV_OVCONS;
+
 	if (strncmp (propb, "tty", 3) || !propb[3])
 		return PROMDEV_O_UNK;
 

+ 8 - 52
arch/sparc64/prom/init.c

@@ -14,11 +14,10 @@
 #include <asm/openprom.h>
 #include <asm/oplib.h>
 
-enum prom_major_version prom_vers;
-unsigned int prom_rev, prom_prev;
+/* OBP version string. */
+char prom_version[80];
 
 /* The root node of the prom device tree. */
-int prom_root_node;
 int prom_stdin, prom_stdout;
 int prom_chosen_node;
 
@@ -31,68 +30,25 @@ extern void prom_cif_init(void *, void *);
 
 void __init prom_init(void *cif_handler, void *cif_stack)
 {
-	char buffer[80], *p;
-	int ints[3];
 	int node;
-	int i = 0;
-	int bufadjust;
-
-	prom_vers = PROM_P1275;
 
 	prom_cif_init(cif_handler, cif_stack);
 
-	prom_root_node = prom_getsibling(0);
-	if((prom_root_node == 0) || (prom_root_node == -1))
-		prom_halt();
-
 	prom_chosen_node = prom_finddevice(prom_chosen_path);
 	if (!prom_chosen_node || prom_chosen_node == -1)
 		prom_halt();
 
-	prom_stdin = prom_getint (prom_chosen_node, "stdin");
-	prom_stdout = prom_getint (prom_chosen_node, "stdout");
+	prom_stdin = prom_getint(prom_chosen_node, "stdin");
+	prom_stdout = prom_getint(prom_chosen_node, "stdout");
 
 	node = prom_finddevice("/openprom");
 	if (!node || node == -1)
 		prom_halt();
 
-	prom_getstring (node, "version", buffer, sizeof (buffer));
-
-	prom_printf ("\n");
-
-	if (strncmp (buffer, "OBP ", 4))
-		goto strange_version;
-
-	/*
-	 * Version field is expected to be 'OBP xx.yy.zz date...'
-	 * However, Sun can't stick to this format very well, so
-	 * we need to check for 'OBP  xx.yy.zz date...' and adjust
-	 * accordingly. -spot
-	 */
-
-	if (strncmp (buffer, "OBP  ", 5))
-		bufadjust = 4;
-	else
-		bufadjust = 5;
-
-	p = buffer + bufadjust;
-	while (p && isdigit(*p) && i < 3) {
-		ints[i++] = simple_strtoul(p, NULL, 0);
-		if ((p = strchr(p, '.')) != NULL)
-			p++;
-	}
-	if (i != 3)
-		goto strange_version;
-
-	prom_rev = ints[1];
-	prom_prev = (ints[0] << 16) | (ints[1] << 8) | ints[2];
-
-	printk ("PROMLIB: Sun IEEE Boot Prom %s\n", buffer + bufadjust);
+	prom_getstring(node, "version", prom_version, sizeof(prom_version));
 
-	/* Initialization successful. */
-	return;
+	prom_printf("\n");
 
-strange_version:
-	prom_printf ("Strange OBP version `%s'.\n", buffer);
-	prom_halt ();
+	printk("PROMLIB: Sun IEEE Boot Prom '%s'\n", prom_version);
+	printk("PROMLIB: Root node compatible: %s\n", prom_root_compatible);
 }

+ 24 - 20
arch/sparc64/prom/misc.c

@@ -112,28 +112,20 @@ unsigned char prom_get_idprom(char *idbuf, int num_bytes)
 	return 0xff;
 }
 
-/* Get the major prom version number. */
-int prom_version(void)
-{
-	return PROM_P1275;
-}
-
-/* Get the prom plugin-revision. */
-int prom_getrev(void)
-{
-	return prom_rev;
-}
-
-/* Get the prom firmware print revision. */
-int prom_getprev(void)
+/* Install Linux trap table so PROM uses that instead of its own. */
+void prom_set_trap_table(unsigned long tba)
 {
-	return prom_prev;
+	p1275_cmd("SUNW,set-trap-table",
+		  (P1275_ARG(0, P1275_ARG_IN_64B) |
+		   P1275_INOUT(1, 0)), tba);
 }
 
-/* Install Linux trap table so PROM uses that instead of its own. */
-void prom_set_trap_table(unsigned long tba)
+void prom_set_trap_table_sun4v(unsigned long tba, unsigned long mmfsa)
 {
-	p1275_cmd("SUNW,set-trap-table", P1275_INOUT(1, 0), tba);
+	p1275_cmd("SUNW,set-trap-table",
+		  (P1275_ARG(0, P1275_ARG_IN_64B) |
+		   P1275_ARG(1, P1275_ARG_IN_64B) |
+		   P1275_INOUT(2, 0)), tba, mmfsa);
 }
 
 int prom_get_mmu_ihandle(void)
@@ -303,9 +295,21 @@ int prom_wakeupsystem(void)
 }
 
 #ifdef CONFIG_SMP
-void prom_startcpu(int cpunode, unsigned long pc, unsigned long o0)
+void prom_startcpu(int cpunode, unsigned long pc, unsigned long arg)
+{
+	p1275_cmd("SUNW,start-cpu", P1275_INOUT(3, 0), cpunode, pc, arg);
+}
+
+void prom_startcpu_cpuid(int cpuid, unsigned long pc, unsigned long arg)
+{
+	p1275_cmd("SUNW,start-cpu-by-cpuid", P1275_INOUT(3, 0),
+		  cpuid, pc, arg);
+}
+
+void prom_stopcpu_cpuid(int cpuid)
 {
-	p1275_cmd("SUNW,start-cpu", P1275_INOUT(3, 0), cpunode, pc, o0);
+	p1275_cmd("SUNW,stop-cpu-by-cpuid", P1275_INOUT(1, 0),
+		  cpuid);
 }
 
 void prom_stopself(void)

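The CPU control helpers now come in two flavors: the original prom_startcpu() keyed by a device-tree node, and prom_startcpu_cpuid()/prom_stopcpu_cpuid() keyed by a CPU id, which is the form the sun4v firmware services take. A hedged sketch of a caller choosing between them (the use_cpuid_service flag is illustrative — the real bringup code keys off the machine type rather than a boolean parameter):

	/* Illustrative only: start a secondary CPU at `entry` with argument
	 * `cookie`, using the cpuid-based service when the firmware provides
	 * it and the node-based one otherwise. */
	static void start_secondary(int use_cpuid_service, int cpuid,
				    int cpu_node, unsigned long entry,
				    unsigned long cookie)
	{
		if (use_cpuid_service)
			prom_startcpu_cpuid(cpuid, entry, cookie);
		else
			prom_startcpu(cpu_node, entry, cookie);
	}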
+ 0 - 11
arch/sparc64/prom/p1275.c

@@ -30,16 +30,6 @@ extern void prom_world(int);
 extern void prom_cif_interface(void);
 extern void prom_cif_callback(void);
 
-static inline unsigned long spitfire_get_primary_context(void)
-{
-	unsigned long ctx;
-
-	__asm__ __volatile__("ldxa	[%1] %2, %0"
-			     : "=r" (ctx)
-			     : "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-	return ctx;
-}
-
 /*
  * This provides SMP safety on the p1275buf. prom_callback() drops this lock
  * to allow recursive acquisition.
@@ -55,7 +45,6 @@ long p1275_cmd(const char *service, long fmt, ...)
 	long attrs, x;
 	
 	p = p1275buf.prom_buffer;
-	BUG_ON((spitfire_get_primary_context() & CTX_NR_MASK) != 0);
 
 	spin_lock_irqsave(&prom_entry_lock, flags);
 

+ 6 - 3
arch/sparc64/prom/tree.c

@@ -51,7 +51,7 @@ prom_getparent(int node)
 __inline__ int
 __prom_getsibling(int node)
 {
-	return p1275_cmd ("peer", P1275_INOUT(1, 1), node);
+	return p1275_cmd(prom_peer_name, P1275_INOUT(1, 1), node);
 }
 
 __inline__ int
@@ -59,9 +59,12 @@ prom_getsibling(int node)
 {
 	int sibnode;
 
-	if(node == -1) return 0;
+	if (node == -1)
+		return 0;
 	sibnode = __prom_getsibling(node);
-	if(sibnode == -1) return 0;
+	if (sibnode == -1)
+		return 0;
+
 	return sibnode;
 }
 

+ 2 - 2
arch/sparc64/solaris/misc.c

@@ -90,7 +90,7 @@ static u32 do_solaris_mmap(u32 addr, u32 len, u32 prot, u32 flags, u32 fd, u64 o
 	len = PAGE_ALIGN(len);
 	if(!(flags & MAP_FIXED))
 		addr = 0;
-	else if (len > 0xf0000000UL || addr > 0xf0000000UL - len)
+	else if (len > STACK_TOP32 || addr > STACK_TOP32 - len)
 		goto out_putf;
 	ret_type = flags & _MAP_NEW;
 	flags &= ~_MAP_NEW;
@@ -102,7 +102,7 @@ static u32 do_solaris_mmap(u32 addr, u32 len, u32 prot, u32 flags, u32 fd, u64 o
 			 (unsigned long) prot, (unsigned long) flags, off);
 	up_write(&current->mm->mmap_sem);
 	if(!ret_type)
-		retval = ((retval < 0xf0000000) ? 0 : retval);
+		retval = ((retval < STACK_TOP32) ? 0 : retval);
 
 out_putf:
 	if (file)

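One detail worth noting in the bound check above: writing addr > STACK_TOP32 - len instead of addr + len > STACK_TOP32 avoids wraparound, since addr + len can overflow while the subtraction cannot once len itself has been checked against the limit. A small hedged illustration (LIMIT is the old hard-coded constant; upstream uses STACK_TOP32):

	#include <stdbool.h>
	#include <stdint.h>

	#define LIMIT 0xf0000000UL	/* illustrative; STACK_TOP32 upstream */

	/* Reject mappings that would end above LIMIT, without risking
	 * wraparound in addr + len. */
	static bool mapping_fits(uint32_t addr, uint32_t len)
	{
		if (len > LIMIT)
			return false;
		/* same as addr + len <= LIMIT, minus the overflow hazard */
		return addr <= LIMIT - len;
	}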
+ 49 - 95
block/as-iosched.c

@@ -182,6 +182,9 @@ struct as_rq {
 
 static kmem_cache_t *arq_pool;
 
+static atomic_t ioc_count = ATOMIC_INIT(0);
+static struct completion *ioc_gone;
+
 static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
 static void as_antic_stop(struct as_data *ad);
 
@@ -193,6 +196,15 @@ static void as_antic_stop(struct as_data *ad);
 static void free_as_io_context(struct as_io_context *aic)
 {
 	kfree(aic);
+	if (atomic_dec_and_test(&ioc_count) && ioc_gone)
+		complete(ioc_gone);
+}
+
+static void as_trim(struct io_context *ioc)
+{
+	if (ioc->aic)
+		free_as_io_context(ioc->aic);
+	ioc->aic = NULL;
 }
 
 /* Called when the task exits */
@@ -220,6 +232,7 @@ static struct as_io_context *alloc_as_io_context(void)
 		ret->seek_total = 0;
 		ret->seek_samples = 0;
 		ret->seek_mean = 0;
+		atomic_inc(&ioc_count);
 	}
 
 	return ret;
@@ -1696,11 +1709,6 @@ static int as_init_queue(request_queue_t *q, elevator_t *e)
 /*
  * sysfs parts below
  */
-struct as_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct as_data *, char *);
-	ssize_t (*store)(struct as_data *, const char *, size_t);
-};
 
 static ssize_t
 as_var_show(unsigned int var, char *page)
@@ -1717,8 +1725,9 @@ as_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t as_est_show(struct as_data *ad, char *page)
+static ssize_t est_time_show(elevator_t *e, char *page)
 {
+	struct as_data *ad = e->elevator_data;
 	int pos = 0;
 
 	pos += sprintf(page+pos, "%lu %% exit probability\n",
@@ -1734,21 +1743,23 @@ static ssize_t as_est_show(struct as_data *ad, char *page)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR)				\
-static ssize_t __FUNC(struct as_data *ad, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)		\
 {								\
+	struct as_data *ad = e->elevator_data;			\
 	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
 }
-SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
-SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
-SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
+SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
-static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
-	int ret = as_var_store(__PTR, (page), count);		\
+	struct as_data *ad = e->elevator_data;				\
+	int ret = as_var_store(__PTR, (page), count);			\
 	if (*(__PTR) < (MIN))						\
 		*(__PTR) = (MIN);					\
 	else if (*(__PTR) > (MAX))					\
@@ -1756,90 +1767,26 @@ static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count)	\
 	*(__PTR) = msecs_to_jiffies(*(__PTR));				\
 	return ret;							\
 }
-STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
-STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
-STORE_FUNCTION(as_read_batchexpire_store,
+STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
+STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
+STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
+STORE_FUNCTION(as_read_batch_expire_store,
 			&ad->batch_expire[REQ_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_write_batchexpire_store,
+STORE_FUNCTION(as_write_batch_expire_store,
 			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
 #undef STORE_FUNCTION
 
-static struct as_fs_entry as_est_entry = {
-	.attr = {.name = "est_time", .mode = S_IRUGO },
-	.show = as_est_show,
-};
-static struct as_fs_entry as_readexpire_entry = {
-	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_readexpire_show,
-	.store = as_readexpire_store,
-};
-static struct as_fs_entry as_writeexpire_entry = {
-	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_writeexpire_show,
-	.store = as_writeexpire_store,
-};
-static struct as_fs_entry as_anticexpire_entry = {
-	.attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_anticexpire_show,
-	.store = as_anticexpire_store,
-};
-static struct as_fs_entry as_read_batchexpire_entry = {
-	.attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_read_batchexpire_show,
-	.store = as_read_batchexpire_store,
-};
-static struct as_fs_entry as_write_batchexpire_entry = {
-	.attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_write_batchexpire_show,
-	.store = as_write_batchexpire_store,
-};
-
-static struct attribute *default_attrs[] = {
-	&as_est_entry.attr,
-	&as_readexpire_entry.attr,
-	&as_writeexpire_entry.attr,
-	&as_anticexpire_entry.attr,
-	&as_read_batchexpire_entry.attr,
-	&as_write_batchexpire_entry.attr,
-	NULL,
-};
-
-#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
-
-static ssize_t
-as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct as_fs_entry *entry = to_as(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-as_attr_store(struct kobject *kobj, struct attribute *attr,
-		    const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct as_fs_entry *entry = to_as(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops as_sysfs_ops = {
-	.show	= as_attr_show,
-	.store	= as_attr_store,
-};
-
-static struct kobj_type as_ktype = {
-	.sysfs_ops	= &as_sysfs_ops,
-	.default_attrs	= default_attrs,
+#define AS_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
+
+static struct elv_fs_entry as_attrs[] = {
+	__ATTR_RO(est_time),
+	AS_ATTR(read_expire),
+	AS_ATTR(write_expire),
+	AS_ATTR(antic_expire),
+	AS_ATTR(read_batch_expire),
+	AS_ATTR(write_batch_expire),
+	__ATTR_NULL
 };
 };
 
 static struct elevator_type iosched_as = {
@@ -1860,9 +1807,10 @@ static struct elevator_type iosched_as = {
 		.elevator_may_queue_fn =	as_may_queue,
 		.elevator_init_fn =		as_init_queue,
 		.elevator_exit_fn =		as_exit_queue,
+		.trim =				as_trim,
 	},
 
-	.elevator_ktype = &as_ktype,
+	.elevator_attrs = as_attrs,
 	.elevator_name = "anticipatory",
 	.elevator_owner = THIS_MODULE,
 };
@@ -1893,7 +1841,13 @@ static int __init as_init(void)
 
 static void __exit as_exit(void)
 {
+	DECLARE_COMPLETION(all_gone);
 	elv_unregister(&iosched_as);
+	ioc_gone = &all_gone;
+	barrier();
+	if (atomic_read(&ioc_count))
+		complete(ioc_gone);
+	synchronize_rcu();
 	kmem_cache_destroy(arq_pool);
 }
 

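Both I/O schedulers in this merge adopt the same teardown handshake: every live io context bumps an atomic counter, the last free signals a completion, and module exit uses that to drain stragglers before destroying the slab caches. A hedged, kernel-flavoured sketch of the shape (struct obj and the surrounding exit steps are illustrative; the exit side is written here with wait_for_completion(), the usual waiter half of this handshake, rather than as a transcription of the hunks above):

	#include <linux/completion.h>
	#include <linux/module.h>
	#include <linux/slab.h>

	struct obj { int payload; };		/* illustrative object */

	static atomic_t obj_count = ATOMIC_INIT(0);
	static struct completion *all_gone;	/* non-NULL only during unload */

	static void obj_free(struct obj *o)
	{
		kfree(o);
		/* The last freer wakes the unloader, if one is waiting. */
		if (atomic_dec_and_test(&obj_count) && all_gone)
			complete(all_gone);
	}

	static void __exit mod_exit(void)
	{
		DECLARE_COMPLETION(drained);

		/* unregister first, so no new objects can be created */
		all_gone = &drained;
		barrier();		/* publish all_gone before the read */
		if (atomic_read(&obj_count))
			wait_for_completion(all_gone);
		/* now safe to destroy the caches the objects lived in */
	}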
+ 175 - 179
block/cfq-iosched.c

@@ -6,21 +6,13 @@
  *
  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
  */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
 #include <linux/config.h>
 #include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
 #include <linux/hash.h>
 #include <linux/rbtree.h>
-#include <linux/mempool.h>
 #include <linux/ioprio.h>
-#include <linux/writeback.h>
 
 /*
  * tunables
@@ -47,6 +39,8 @@ static int cfq_slice_idle = HZ / 100;
  */
 static const int cfq_max_depth = 2;
 
+static DEFINE_RWLOCK(cfq_exit_lock);
+
 /*
  * for the hash of cfqq inside the cfqd
  */
@@ -89,6 +83,9 @@ static kmem_cache_t *crq_pool;
 static kmem_cache_t *cfq_pool;
 static kmem_cache_t *cfq_ioc_pool;
 
+static atomic_t ioc_count = ATOMIC_INIT(0);
+static struct completion *ioc_gone;
+
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_be(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
@@ -109,7 +106,6 @@ static kmem_cache_t *cfq_ioc_pool;
  * Per block device queue structure
  */
 struct cfq_data {
-	atomic_t ref;
 	request_queue_t *queue;
 
 	/*
@@ -175,6 +171,8 @@ struct cfq_data {
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
 	unsigned int cfq_max_depth;
+
+	struct list_head cic_list;
 };
 
 /*
@@ -288,7 +286,7 @@ CFQ_CRQ_FNS(is_sync);
 
 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
-static void cfq_put_cfqd(struct cfq_data *cfqd);
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 
 #define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
 
@@ -1160,8 +1158,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 	if (unlikely(cfqd->active_queue == cfqq))
 		__cfq_slice_expired(cfqd, cfqq, 0);
 
-	cfq_put_cfqd(cfqq->cfqd);
-
 	/*
 	 * it's on the empty list and still hashed
 	 */
@@ -1179,7 +1175,7 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
 
 	hlist_for_each_safe(entry, next, hash_list) {
 		struct cfq_queue *__cfqq = list_entry_qhash(entry);
-		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
+		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
 
 		if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
 			return __cfqq;
@@ -1198,13 +1194,24 @@ static void cfq_free_io_context(struct cfq_io_context *cic)
 {
 	struct cfq_io_context *__cic;
 	struct list_head *entry, *next;
+	int freed = 1;
 
 	list_for_each_safe(entry, next, &cic->list) {
 		__cic = list_entry(entry, struct cfq_io_context, list);
 		kmem_cache_free(cfq_ioc_pool, __cic);
+		freed++;
 	}
 
 	kmem_cache_free(cfq_ioc_pool, cic);
+	if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
+		complete(ioc_gone);
+}
+
+static void cfq_trim(struct io_context *ioc)
+{
+	ioc->set_ioprio = NULL;
+	if (ioc->cic)
+		cfq_free_io_context(ioc->cic);
 }
 
 /*
@@ -1212,25 +1219,37 @@ static void cfq_free_io_context(struct cfq_io_context *cic)
  */
 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 {
-	struct cfq_data *cfqd = cic->cfqq->cfqd;
-	request_queue_t *q = cfqd->queue;
+	struct cfq_data *cfqd = cic->key;
+	request_queue_t *q;
+
+	if (!cfqd)
+		return;
+
+	q = cfqd->queue;
 
 	WARN_ON(!irqs_disabled());
 
 	spin_lock(q->queue_lock);
 
-	if (unlikely(cic->cfqq == cfqd->active_queue))
-		__cfq_slice_expired(cfqd, cic->cfqq, 0);
+	if (cic->cfqq[ASYNC]) {
+		if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
+			__cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
+		cfq_put_queue(cic->cfqq[ASYNC]);
+		cic->cfqq[ASYNC] = NULL;
+	}
+
+	if (cic->cfqq[SYNC]) {
+		if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
+			__cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
+		cfq_put_queue(cic->cfqq[SYNC]);
+		cic->cfqq[SYNC] = NULL;
+	}
 
-	cfq_put_queue(cic->cfqq);
-	cic->cfqq = NULL;
+	cic->key = NULL;
+	list_del_init(&cic->queue_list);
 	spin_unlock(q->queue_lock);
 }
 
-/*
- * Another task may update the task cic list, if it is doing a queue lookup
- * on its behalf. cfq_cic_lock excludes such concurrent updates
- */
 static void cfq_exit_io_context(struct cfq_io_context *cic)
 {
 	struct cfq_io_context *__cic;
@@ -1242,12 +1261,14 @@ static void cfq_exit_io_context(struct cfq_io_context *cic)
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
+	read_lock(&cfq_exit_lock);
 	list_for_each(entry, &cic->list) {
 		__cic = list_entry(entry, struct cfq_io_context, list);
 		cfq_exit_single_io_context(__cic);
 	}
 
 	cfq_exit_single_io_context(cic);
+	read_unlock(&cfq_exit_lock);
 	local_irq_restore(flags);
 }
 
@@ -1258,7 +1279,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 
 	if (cic) {
 		INIT_LIST_HEAD(&cic->list);
-		cic->cfqq = NULL;
+		cic->cfqq[ASYNC] = NULL;
+		cic->cfqq[SYNC] = NULL;
 		cic->key = NULL;
 		cic->last_end_request = jiffies;
 		cic->ttime_total = 0;
@@ -1266,6 +1288,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		cic->ttime_mean = 0;
 		cic->dtor = cfq_free_io_context;
 		cic->exit = cfq_exit_io_context;
+		INIT_LIST_HEAD(&cic->queue_list);
+		atomic_inc(&ioc_count);
 	}
 
 	return cic;
@@ -1318,14 +1342,27 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
-static inline void changed_ioprio(struct cfq_queue *cfqq)
+static inline void changed_ioprio(struct cfq_io_context *cic)
 {
-	if (cfqq) {
-		struct cfq_data *cfqd = cfqq->cfqd;
-
+	struct cfq_data *cfqd = cic->key;
+	struct cfq_queue *cfqq;
+	if (cfqd) {
 		spin_lock(cfqd->queue->queue_lock);
-		cfq_mark_cfqq_prio_changed(cfqq);
-		cfq_init_prio_data(cfqq);
+		cfqq = cic->cfqq[ASYNC];
+		if (cfqq) {
+			struct cfq_queue *new_cfqq;
+			new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC,
+						cic->ioc->task, GFP_ATOMIC);
+			if (new_cfqq) {
+				cic->cfqq[ASYNC] = new_cfqq;
+				cfq_put_queue(cfqq);
+			}
+		}
+		cfqq = cic->cfqq[SYNC];
+		if (cfqq) {
+			cfq_mark_cfqq_prio_changed(cfqq);
+			cfq_init_prio_data(cfqq);
+		}
 		spin_unlock(cfqd->queue->queue_lock);
 	}
 }
@@ -1335,24 +1372,32 @@ static inline void changed_ioprio(struct cfq_queue *cfqq)
  */
 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
 {
-	struct cfq_io_context *cic = ioc->cic;
+	struct cfq_io_context *cic;
+
+	write_lock(&cfq_exit_lock);
+
+	cic = ioc->cic;
 
-	changed_ioprio(cic->cfqq);
+	changed_ioprio(cic);
 
 	list_for_each_entry(cic, &cic->list, list)
-		changed_ioprio(cic->cfqq);
+		changed_ioprio(cic);
+
+	write_unlock(&cfq_exit_lock);
 
 	return 0;
 }
 
 static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
+cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
 	      gfp_t gfp_mask)
 {
 	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
+	unsigned short ioprio;
 
 retry:
+	ioprio = tsk->ioprio;
 	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
 
 	if (!cfqq) {
@@ -1381,7 +1426,6 @@ retry:
 		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
 		atomic_set(&cfqq->ref, 0);
 		cfqq->cfqd = cfqd;
-		atomic_inc(&cfqd->ref);
 		cfqq->service_last = 0;
 		/*
 		 * set ->slice_left to allow preemption for a new process
@@ -1419,6 +1463,7 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 	if (!ioc)
 		return NULL;
 
+restart:
 	if ((cic = ioc->cic) == NULL) {
 		cic = cfq_alloc_io_context(cfqd, gfp_mask);
 
@@ -1429,11 +1474,13 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 		 * manually increment generic io_context usage count, it
 		 * cannot go away since we are already holding one ref to it
 		 */
-		ioc->cic = cic;
-		ioc->set_ioprio = cfq_ioc_set_ioprio;
 		cic->ioc = ioc;
 		cic->key = cfqd;
-		atomic_inc(&cfqd->ref);
+		read_lock(&cfq_exit_lock);
+		ioc->set_ioprio = cfq_ioc_set_ioprio;
+		ioc->cic = cic;
+		list_add(&cic->queue_list, &cfqd->cic_list);
+		read_unlock(&cfq_exit_lock);
 	} else {
 		struct cfq_io_context *__cic;
 
@@ -1443,6 +1490,20 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 		if (cic->key == cfqd)
 			goto out;
 
+		if (unlikely(!cic->key)) {
+			read_lock(&cfq_exit_lock);
+			if (list_empty(&cic->list))
+				ioc->cic = NULL;
+			else
+				ioc->cic = list_entry(cic->list.next,
+						      struct cfq_io_context,
+						      list);
+			read_unlock(&cfq_exit_lock);
+			kmem_cache_free(cfq_ioc_pool, cic);
+			atomic_dec(&ioc_count);
+			goto restart;
+		}
+
 		/*
 		 * cic exists, check if we already are there. linear search
 		 * should be ok here, the list will usually not be more than
@@ -1457,6 +1518,14 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 				cic = __cic;
 				goto out;
 			}
+			if (unlikely(!__cic->key)) {
+				read_lock(&cfq_exit_lock);
+				list_del(&__cic->list);
+				read_unlock(&cfq_exit_lock);
+				kmem_cache_free(cfq_ioc_pool, __cic);
+				atomic_dec(&ioc_count);
+				goto restart;
+			}
 		}
 		}
 
 
 		/*
 		/*
@@ -1469,8 +1538,10 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 
 
 		__cic->ioc = ioc;
 		__cic->ioc = ioc;
 		__cic->key = cfqd;
 		__cic->key = cfqd;
-		atomic_inc(&cfqd->ref);
+		read_lock(&cfq_exit_lock);
 		list_add(&__cic->list, &cic->list);
 		list_add(&__cic->list, &cic->list);
+		list_add(&__cic->queue_list, &cfqd->cic_list);
+		read_unlock(&cfq_exit_lock);
 		cic = __cic;
 		cic = __cic;
 	}
 	}
 
 
@@ -1890,6 +1961,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 	struct cfq_queue *cfqq;
 	struct cfq_queue *cfqq;
 	struct cfq_rq *crq;
 	struct cfq_rq *crq;
 	unsigned long flags;
 	unsigned long flags;
+	int is_sync = key != CFQ_KEY_ASYNC;
 
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 
@@ -1900,14 +1972,14 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 	if (!cic)
 	if (!cic)
 		goto queue_fail;
 		goto queue_fail;
 
 
-	if (!cic->cfqq) {
-		cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
+	if (!cic->cfqq[is_sync]) {
+		cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
 		if (!cfqq)
 		if (!cfqq)
 			goto queue_fail;
 			goto queue_fail;
 
 
-		cic->cfqq = cfqq;
+		cic->cfqq[is_sync] = cfqq;
 	} else
 	} else
-		cfqq = cic->cfqq;
+		cfqq = cic->cfqq[is_sync];
 
 
 	cfqq->allocated[rw]++;
 	cfqq->allocated[rw]++;
 	cfq_clear_cfqq_must_alloc(cfqq);
 	cfq_clear_cfqq_must_alloc(cfqq);
@@ -1924,7 +1996,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 		crq->cfq_queue = cfqq;
 		crq->cfq_queue = cfqq;
 		crq->io_context = cic;
 		crq->io_context = cic;
 
 
-		if (rw == READ || process_sync(tsk))
+		if (is_sync)
 			cfq_mark_crq_is_sync(crq);
 			cfq_mark_crq_is_sync(crq);
 		else
 		else
 			cfq_clear_crq_is_sync(crq);
 			cfq_clear_crq_is_sync(crq);
@@ -2055,15 +2127,35 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 	blk_sync_queue(cfqd->queue);
 	blk_sync_queue(cfqd->queue);
 }
 }
 
 
-static void cfq_put_cfqd(struct cfq_data *cfqd)
+static void cfq_exit_queue(elevator_t *e)
 {
 {
+	struct cfq_data *cfqd = e->elevator_data;
 	request_queue_t *q = cfqd->queue;
 	request_queue_t *q = cfqd->queue;
 
 
-	if (!atomic_dec_and_test(&cfqd->ref))
-		return;
+	cfq_shutdown_timer_wq(cfqd);
+	write_lock(&cfq_exit_lock);
+	spin_lock_irq(q->queue_lock);
+	if (cfqd->active_queue)
+		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
+	while(!list_empty(&cfqd->cic_list)) {
+		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
+							struct cfq_io_context,
+							queue_list);
+		if (cic->cfqq[ASYNC]) {
+			cfq_put_queue(cic->cfqq[ASYNC]);
+			cic->cfqq[ASYNC] = NULL;
+		}
+		if (cic->cfqq[SYNC]) {
+			cfq_put_queue(cic->cfqq[SYNC]);
+			cic->cfqq[SYNC] = NULL;
+		}
+		cic->key = NULL;
+		list_del_init(&cic->queue_list);
+	}
+	spin_unlock_irq(q->queue_lock);
+	write_unlock(&cfq_exit_lock);
 
 
 	cfq_shutdown_timer_wq(cfqd);
 	cfq_shutdown_timer_wq(cfqd);
-	blk_put_queue(q);
 
 
 	mempool_destroy(cfqd->crq_pool);
 	mempool_destroy(cfqd->crq_pool);
 	kfree(cfqd->crq_hash);
 	kfree(cfqd->crq_hash);
@@ -2071,14 +2163,6 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
 	kfree(cfqd);
 	kfree(cfqd);
 }
 }
 
 
-static void cfq_exit_queue(elevator_t *e)
-{
-	struct cfq_data *cfqd = e->elevator_data;
-
-	cfq_shutdown_timer_wq(cfqd);
-	cfq_put_cfqd(cfqd);
-}
-
 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 {
 {
 	struct cfq_data *cfqd;
 	struct cfq_data *cfqd;
@@ -2097,6 +2181,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->empty_list);
 	INIT_LIST_HEAD(&cfqd->empty_list);
+	INIT_LIST_HEAD(&cfqd->cic_list);
 
 
 	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
 	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
 	if (!cfqd->crq_hash)
 	if (!cfqd->crq_hash)
@@ -2118,7 +2203,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	e->elevator_data = cfqd;
 	e->elevator_data = cfqd;
 
 
 	cfqd->queue = q;
 	cfqd->queue = q;
-	atomic_inc(&q->refcnt);
 
 
 	cfqd->max_queued = q->nr_requests / 4;
 	cfqd->max_queued = q->nr_requests / 4;
 	q->nr_batching = cfq_queued;
 	q->nr_batching = cfq_queued;
@@ -2133,8 +2217,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 
 
 	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
 	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
 
 
-	atomic_set(&cfqd->ref, 1);
-
 	cfqd->cfq_queued = cfq_queued;
 	cfqd->cfq_queued = cfq_queued;
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
@@ -2193,11 +2275,6 @@ fail:
 /*
 /*
  * sysfs parts below -->
  * sysfs parts below -->
  */
  */
-struct cfq_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct cfq_data *, char *);
-	ssize_t (*store)(struct cfq_data *, const char *, size_t);
-};
 
 
 static ssize_t
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 cfq_var_show(unsigned int var, char *page)
@@ -2215,8 +2292,9 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
 }
 }
 
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct cfq_data *cfqd, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)			\
 {									\
 {									\
+	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data = __VAR;					\
 	unsigned int __data = __VAR;					\
 	if (__CONV)							\
 	if (__CONV)							\
 		__data = jiffies_to_msecs(__data);			\
 		__data = jiffies_to_msecs(__data);			\
@@ -2226,8 +2304,8 @@ SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
-SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
-SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
+SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
+SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
@@ -2236,8 +2314,9 @@ SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
 #undef SHOW_FUNCTION
 #undef SHOW_FUNCTION
 
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
 {									\
+	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data;						\
 	unsigned int __data;						\
 	int ret = cfq_var_store(&__data, (page), count);		\
 	int ret = cfq_var_store(&__data, (page), count);		\
 	if (__data < (MIN))						\
 	if (__data < (MIN))						\
@@ -2254,8 +2333,8 @@ STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
-STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
-STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
+STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
+STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
@@ -2263,112 +2342,22 @@ STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX,
 STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
 #undef STORE_FUNCTION
 #undef STORE_FUNCTION
 
 
-static struct cfq_fs_entry cfq_quantum_entry = {
-	.attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_quantum_show,
-	.store = cfq_quantum_store,
-};
-static struct cfq_fs_entry cfq_queued_entry = {
-	.attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_queued_show,
-	.store = cfq_queued_store,
-};
-static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
-	.attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_sync_show,
-	.store = cfq_fifo_expire_sync_store,
-};
-static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
-	.attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_async_show,
-	.store = cfq_fifo_expire_async_store,
-};
-static struct cfq_fs_entry cfq_back_max_entry = {
-	.attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_back_max_show,
-	.store = cfq_back_max_store,
-};
-static struct cfq_fs_entry cfq_back_penalty_entry = {
-	.attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_back_penalty_show,
-	.store = cfq_back_penalty_store,
-};
-static struct cfq_fs_entry cfq_slice_sync_entry = {
-	.attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_sync_show,
-	.store = cfq_slice_sync_store,
-};
-static struct cfq_fs_entry cfq_slice_async_entry = {
-	.attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_async_show,
-	.store = cfq_slice_async_store,
-};
-static struct cfq_fs_entry cfq_slice_async_rq_entry = {
-	.attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_async_rq_show,
-	.store = cfq_slice_async_rq_store,
-};
-static struct cfq_fs_entry cfq_slice_idle_entry = {
-	.attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_idle_show,
-	.store = cfq_slice_idle_store,
-};
-static struct cfq_fs_entry cfq_max_depth_entry = {
-	.attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_max_depth_show,
-	.store = cfq_max_depth_store,
-};
-
-static struct attribute *default_attrs[] = {
-	&cfq_quantum_entry.attr,
-	&cfq_queued_entry.attr,
-	&cfq_fifo_expire_sync_entry.attr,
-	&cfq_fifo_expire_async_entry.attr,
-	&cfq_back_max_entry.attr,
-	&cfq_back_penalty_entry.attr,
-	&cfq_slice_sync_entry.attr,
-	&cfq_slice_async_entry.attr,
-	&cfq_slice_async_rq_entry.attr,
-	&cfq_slice_idle_entry.attr,
-	&cfq_max_depth_entry.attr,
-	NULL,
-};
-
-#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
-
-static ssize_t
-cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct cfq_fs_entry *entry = to_cfq(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-cfq_attr_store(struct kobject *kobj, struct attribute *attr,
-	       const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct cfq_fs_entry *entry = to_cfq(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops cfq_sysfs_ops = {
-	.show	= cfq_attr_show,
-	.store	= cfq_attr_store,
-};
-
-static struct kobj_type cfq_ktype = {
-	.sysfs_ops	= &cfq_sysfs_ops,
-	.default_attrs	= default_attrs,
+#define CFQ_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
+
+static struct elv_fs_entry cfq_attrs[] = {
+	CFQ_ATTR(quantum),
+	CFQ_ATTR(queued),
+	CFQ_ATTR(fifo_expire_sync),
+	CFQ_ATTR(fifo_expire_async),
+	CFQ_ATTR(back_seek_max),
+	CFQ_ATTR(back_seek_penalty),
+	CFQ_ATTR(slice_sync),
+	CFQ_ATTR(slice_async),
+	CFQ_ATTR(slice_async_rq),
+	CFQ_ATTR(slice_idle),
+	CFQ_ATTR(max_depth),
+	__ATTR_NULL
 };
 };
 
 
 static struct elevator_type iosched_cfq = {
 static struct elevator_type iosched_cfq = {
@@ -2389,8 +2378,9 @@ static struct elevator_type iosched_cfq = {
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
+		.trim =				cfq_trim,
 	},
 	},
-	.elevator_ktype =	&cfq_ktype,
+	.elevator_attrs =	cfq_attrs,
 	.elevator_name =	"cfq",
 	.elevator_name =	"cfq",
 	.elevator_owner =	THIS_MODULE,
 	.elevator_owner =	THIS_MODULE,
 };
 };
@@ -2419,7 +2409,13 @@ static int __init cfq_init(void)
 
 
 static void __exit cfq_exit(void)
 static void __exit cfq_exit(void)
 {
 {
+	DECLARE_COMPLETION(all_gone);
 	elv_unregister(&iosched_cfq);
 	elv_unregister(&iosched_cfq);
+	ioc_gone = &all_gone;
+	barrier();
+	if (atomic_read(&ioc_count))
+		complete(ioc_gone);
+	synchronize_rcu();
 	cfq_slab_kill();
 	cfq_slab_kill();
 }
 }
 
 

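Aside on the cfq hunks above: a cfq_io_context now holds two queues instead of one, indexed by whether the request is synchronous (cic->cfqq[is_sync], where is_sync comes from key != CFQ_KEY_ASYNC), so one process can have separate sync and async scheduling state. A minimal userspace sketch of that boolean-index idiom — names are hypothetical, not the kernel's:

	#include <stdio.h>

	struct queue { const char *name; };

	struct io_ctx {
		struct queue *cfqq[2];	/* cfqq[0] = async, cfqq[1] = sync */
	};

	int main(void)
	{
		struct queue async = { "async" }, sync = { "sync" };
		struct io_ctx cic = { { &async, &sync } };
		int is_sync = 1;	/* in cfq: key != CFQ_KEY_ASYNC */

		printf("%s\n", cic.cfqq[is_sync]->name);	/* prints "sync" */
		return 0;
	}
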
+ 27 - 89
block/deadline-iosched.c

@@ -694,11 +694,6 @@ deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 /*
  * sysfs parts below
  */
-struct deadline_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct deadline_data *, char *);
-	ssize_t (*store)(struct deadline_data *, const char *, size_t);
-};
 
 static ssize_t
 deadline_var_show(int var, char *page)
@@ -716,23 +711,25 @@ deadline_var_store(int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct deadline_data *dd, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)			\
 {									\
-	int __data = __VAR;					\
+	struct deadline_data *dd = e->elevator_data;			\
+	int __data = __VAR;						\
 	if (__CONV)							\
 		__data = jiffies_to_msecs(__data);			\
 	return deadline_var_show(__data, (page));			\
 }
-SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1);
-SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1);
-SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0);
-SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0);
-SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0);
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
+	struct deadline_data *dd = e->elevator_data;			\
 	int __data;							\
 	int ret = deadline_var_store(&__data, (page), count);		\
 	if (__data < (MIN))						\
@@ -745,83 +742,24 @@ static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count)
 		*(__PTR) = __data;					\
 	return ret;							\
 }
-STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
-STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0);
-STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
-static struct deadline_fs_entry deadline_readexpire_entry = {
-	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_readexpire_show,
-	.store = deadline_readexpire_store,
-};
-static struct deadline_fs_entry deadline_writeexpire_entry = {
-	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_writeexpire_show,
-	.store = deadline_writeexpire_store,
-};
-static struct deadline_fs_entry deadline_writesstarved_entry = {
-	.attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_writesstarved_show,
-	.store = deadline_writesstarved_store,
-};
-static struct deadline_fs_entry deadline_frontmerges_entry = {
-	.attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_frontmerges_show,
-	.store = deadline_frontmerges_store,
-};
-static struct deadline_fs_entry deadline_fifobatch_entry = {
-	.attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_fifobatch_show,
-	.store = deadline_fifobatch_store,
-};
-
-static struct attribute *default_attrs[] = {
-	&deadline_readexpire_entry.attr,
-	&deadline_writeexpire_entry.attr,
-	&deadline_writesstarved_entry.attr,
-	&deadline_frontmerges_entry.attr,
-	&deadline_fifobatch_entry.attr,
-	NULL,
-};
-
-#define to_deadline(atr) container_of((atr), struct deadline_fs_entry, attr)
-
-static ssize_t
-deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct deadline_fs_entry *entry = to_deadline(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-deadline_attr_store(struct kobject *kobj, struct attribute *attr,
-		    const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct deadline_fs_entry *entry = to_deadline(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops deadline_sysfs_ops = {
-	.show	= deadline_attr_show,
-	.store	= deadline_attr_store,
-};
-
-static struct kobj_type deadline_ktype = {
-	.sysfs_ops	= &deadline_sysfs_ops,
-	.default_attrs	= default_attrs,
+#define DD_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+				      deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+	DD_ATTR(read_expire),
+	DD_ATTR(write_expire),
+	DD_ATTR(writes_starved),
+	DD_ATTR(front_merges),
+	DD_ATTR(fifo_batch),
+	__ATTR_NULL
 };
 
 static struct elevator_type iosched_deadline = {
@@ -840,7 +778,7 @@ static struct elevator_type iosched_deadline = {
 		.elevator_exit_fn =		deadline_exit_queue,
 	},
 
-	.elevator_ktype = &deadline_ktype,
+	.elevator_attrs = deadline_attrs,
 	.elevator_name = "deadline",
 	.elevator_owner = THIS_MODULE,
 };

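Aside: both io schedulers now declare their sysfs tunables through sysfs's generic __ATTR() initializer plus a name-pasting wrapper (CFQ_ATTR/DD_ATTR above), replacing dozens of lines of hand-rolled *_fs_entry boilerplate per scheduler; elevator.c (next file) supplies the one shared show/store dispatcher. The idiom is ordinary stringification and token pasting — a self-contained userspace sketch with hypothetical names:

	#include <stdio.h>

	struct attr { const char *name; void (*show)(void); };

	static void read_expire_show(void)  { puts("read_expire"); }
	static void write_expire_show(void) { puts("write_expire"); }

	/* like DD_ATTR(): #n stringifies the name, n##_show pastes the handler */
	#define DD_ATTR(n) { #n, n##_show }

	static struct attr attrs[] = {
		DD_ATTR(read_expire),
		DD_ATTR(write_expire),
		{ NULL, NULL }		/* terminator, like __ATTR_NULL */
	};

	int main(void)
	{
		for (struct attr *a = attrs; a->name; a++)
			a->show();
		return 0;
	}
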
+ 118 - 53
block/elevator.c

@@ -120,15 +120,10 @@ static struct elevator_type *elevator_get(const char *name)
 	return e;
 }
 
-static int elevator_attach(request_queue_t *q, struct elevator_type *e,
-			   struct elevator_queue *eq)
+static int elevator_attach(request_queue_t *q, struct elevator_queue *eq)
 {
 	int ret = 0;
 
-	memset(eq, 0, sizeof(*eq));
-	eq->ops = &e->ops;
-	eq->elevator_type = e;
-
 	q->elevator = eq;
 
 	if (eq->ops->elevator_init_fn)
@@ -154,6 +149,32 @@ static int __init elevator_setup(char *str)
 
 __setup("elevator=", elevator_setup);
 
+static struct kobj_type elv_ktype;
+
+static elevator_t *elevator_alloc(struct elevator_type *e)
+{
+	elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL);
+	if (eq) {
+		memset(eq, 0, sizeof(*eq));
+		eq->ops = &e->ops;
+		eq->elevator_type = e;
+		kobject_init(&eq->kobj);
+		snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
+		eq->kobj.ktype = &elv_ktype;
+		mutex_init(&eq->sysfs_lock);
+	} else {
+		elevator_put(e);
+	}
+	return eq;
+}
+
+static void elevator_release(struct kobject *kobj)
+{
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	elevator_put(e->elevator_type);
+	kfree(e);
+}
+
 int elevator_init(request_queue_t *q, char *name)
 {
 	struct elevator_type *e = NULL;
@@ -176,29 +197,26 @@ int elevator_init(request_queue_t *q, char *name)
 		e = elevator_get("noop");
 	}
 
-	eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
-	if (!eq) {
-		elevator_put(e);
+	eq = elevator_alloc(e);
+	if (!eq)
 		return -ENOMEM;
-	}
 
-	ret = elevator_attach(q, e, eq);
-	if (ret) {
-		kfree(eq);
-		elevator_put(e);
-	}
+	ret = elevator_attach(q, eq);
+	if (ret)
+		kobject_put(&eq->kobj);
 
 	return ret;
 }
 
 void elevator_exit(elevator_t *e)
 {
+	mutex_lock(&e->sysfs_lock);
 	if (e->ops->elevator_exit_fn)
 		e->ops->elevator_exit_fn(e);
+	e->ops = NULL;
+	mutex_unlock(&e->sysfs_lock);
 
-	elevator_put(e->elevator_type);
-	e->elevator_type = NULL;
-	kfree(e);
+	kobject_put(&e->kobj);
 }
 
 /*
@@ -627,26 +645,79 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 	}
 }
 
-int elv_register_queue(struct request_queue *q)
+#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
+
+static ssize_t
+elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
-	elevator_t *e = q->elevator;
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elv_fs_entry *entry = to_elv(attr);
+	ssize_t error;
 
-	e->kobj.parent = kobject_get(&q->kobj);
-	if (!e->kobj.parent)
-		return -EBUSY;
+	if (!entry->show)
+		return -EIO;
 
-	snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
-	e->kobj.ktype = e->elevator_type->elevator_ktype;
+	mutex_lock(&e->sysfs_lock);
+	error = e->ops ? entry->show(e, page) : -ENOENT;
+	mutex_unlock(&e->sysfs_lock);
+	return error;
+}
+
+static ssize_t
+elv_attr_store(struct kobject *kobj, struct attribute *attr,
+	       const char *page, size_t length)
+{
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elv_fs_entry *entry = to_elv(attr);
+	ssize_t error;
+
+	if (!entry->store)
+		return -EIO;
+
+	mutex_lock(&e->sysfs_lock);
+	error = e->ops ? entry->store(e, page, length) : -ENOENT;
+	mutex_unlock(&e->sysfs_lock);
+	return error;
+}
+
+static struct sysfs_ops elv_sysfs_ops = {
+	.show	= elv_attr_show,
+	.store	= elv_attr_store,
+};
+
+static struct kobj_type elv_ktype = {
+	.sysfs_ops	= &elv_sysfs_ops,
+	.release	= elevator_release,
+};
 
-	return kobject_register(&e->kobj);
+int elv_register_queue(struct request_queue *q)
+{
+	elevator_t *e = q->elevator;
+	int error;
+
+	e->kobj.parent = &q->kobj;
+
+	error = kobject_add(&e->kobj);
+	if (!error) {
+		struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
+		if (attr) {
+			while (attr->attr.name) {
+				if (sysfs_create_file(&e->kobj, &attr->attr))
+					break;
+				attr++;
+			}
+		}
+		kobject_uevent(&e->kobj, KOBJ_ADD);
+	}
+	return error;
 }
 
 void elv_unregister_queue(struct request_queue *q)
 {
 	if (q) {
 		elevator_t *e = q->elevator;
-		kobject_unregister(&e->kobj);
-		kobject_put(&q->kobj);
+		kobject_uevent(&e->kobj, KOBJ_REMOVE);
+		kobject_del(&e->kobj);
 	}
 }
 
@@ -675,21 +746,15 @@ void elv_unregister(struct elevator_type *e)
 	/*
 	 * Iterate every thread in the process to remove the io contexts.
 	 */
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		struct io_context *ioc = p->io_context;
-		if (ioc && ioc->cic) {
-			ioc->cic->exit(ioc->cic);
-			ioc->cic->dtor(ioc->cic);
-			ioc->cic = NULL;
-		}
-		if (ioc && ioc->aic) {
-			ioc->aic->exit(ioc->aic);
-			ioc->aic->dtor(ioc->aic);
-			ioc->aic = NULL;
-		}
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
+	if (e->ops.trim) {
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p) {
+			task_lock(p);
+			e->ops.trim(p->io_context);
+			task_unlock(p);
+		} while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+	}
 
 	spin_lock_irq(&elv_list_lock);
 	list_del_init(&e->list);
@@ -703,16 +768,16 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  * need for the new one. this way we have a chance of going back to the old
  * one, if the new one fails init for some reason.
  */
-static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
+static int elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
 	elevator_t *old_elevator, *e;
 
 	/*
 	 * Allocate new elevator
 	 */
-	e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
+	e = elevator_alloc(new_e);
 	if (!e)
-		goto error;
+		return 0;
 
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data
@@ -743,7 +808,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 	/*
 	 * attach and start new elevator
 	 */
-	if (elevator_attach(q, new_e, e))
+	if (elevator_attach(q, e))
 		goto fail;
 
 	if (elv_register_queue(q))
@@ -754,7 +819,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 	 */
 	elevator_exit(old_elevator);
 	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	return;
+	return 1;
 
 fail_register:
 	/*
@@ -767,10 +832,9 @@ fail:
 	q->elevator = old_elevator;
 	elv_register_queue(q);
 	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	kfree(e);
-error:
-	elevator_put(new_e);
-	printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
+	if (e)
+		kobject_put(&e->kobj);
+	return 0;
 }
 
 ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
@@ -797,7 +861,8 @@ ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
 		return count;
 	}
 
-	elevator_switch(q, e);
+	if (!elevator_switch(q, e))
+		printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name);
 	return count;
 }
 

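Aside: elv_attr_show()/elv_attr_store() above recover the elevator_t from the embedded kobject via container_of(), then check e->ops under sysfs_lock so a sysfs access racing with elevator_exit() gets -ENOENT instead of dereferencing torn-down ops. The container_of() arithmetic, sketched standalone (simplified; the kernel macro adds type checking):

	#include <stdio.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct kobj { int ref; };

	struct elevator {
		const char *name;
		struct kobj kobj;	/* embedded object, not a pointer */
	};

	int main(void)
	{
		struct elevator e = { "cfq", { 1 } };
		struct kobj *k = &e.kobj;	/* what sysfs hands the callback */
		struct elevator *back = container_of(k, struct elevator, kobj);

		printf("%s\n", back->name);	/* prints "cfq" */
		return 0;
	}
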
+ 72 - 33
block/ll_rw_blk.c

@@ -1740,16 +1740,11 @@ EXPORT_SYMBOL(blk_run_queue);
 *     Hopefully the low level driver will have finished any
 *     outstanding requests first...
 **/
-void blk_cleanup_queue(request_queue_t * q)
+static void blk_release_queue(struct kobject *kobj)
 {
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
 	struct request_list *rl = &q->rq;
 
-	if (!atomic_dec_and_test(&q->refcnt))
-		return;
-
-	if (q->elevator)
-		elevator_exit(q->elevator);
-
 	blk_sync_queue(q);
 
 	if (rl->rq_pool)
@@ -1761,6 +1756,24 @@ void blk_cleanup_queue(request_queue_t * q)
 	kmem_cache_free(requestq_cachep, q);
 }
 
+void blk_put_queue(request_queue_t *q)
+{
+	kobject_put(&q->kobj);
+}
+EXPORT_SYMBOL(blk_put_queue);
+
+void blk_cleanup_queue(request_queue_t * q)
+{
+	mutex_lock(&q->sysfs_lock);
+	set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
+	mutex_unlock(&q->sysfs_lock);
+
+	if (q->elevator)
+		elevator_exit(q->elevator);
+
+	blk_put_queue(q);
+}
+
 EXPORT_SYMBOL(blk_cleanup_queue);
 
 static int blk_init_free_list(request_queue_t *q)
@@ -1788,6 +1801,8 @@ request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
+static struct kobj_type queue_ktype;
+
 request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	request_queue_t *q;
@@ -1798,11 +1813,16 @@ request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
-	atomic_set(&q->refcnt, 1);
+
+	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
+	q->kobj.ktype = &queue_ktype;
+	kobject_init(&q->kobj);
 
 	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
 	q->backing_dev_info.unplug_io_data = q;
 
+	mutex_init(&q->sysfs_lock);
+
 	return q;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -1854,8 +1874,10 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 		return NULL;
 
 	q->node = node_id;
-	if (blk_init_free_list(q))
-		goto out_init;
+	if (blk_init_free_list(q)) {
+		kmem_cache_free(requestq_cachep, q);
+		return NULL;
+	}
 
 	/*
 	 * if caller didn't supply a lock, they get per-queue locking with
@@ -1891,9 +1913,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 		return q;
 	}
 
-	blk_cleanup_queue(q);
-out_init:
-	kmem_cache_free(requestq_cachep, q);
+	blk_put_queue(q);
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_queue_node);
@@ -1901,7 +1921,7 @@ EXPORT_SYMBOL(blk_init_queue_node);
 int blk_get_queue(request_queue_t *q)
 {
 	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		atomic_inc(&q->refcnt);
+		kobject_get(&q->kobj);
 		return 0;
 	}
 
@@ -3477,10 +3497,12 @@ void put_io_context(struct io_context *ioc)
 	BUG_ON(atomic_read(&ioc->refcount) == 0);
 
 	if (atomic_dec_and_test(&ioc->refcount)) {
+		rcu_read_lock();
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
 		if (ioc->cic && ioc->cic->dtor)
 			ioc->cic->dtor(ioc->cic);
+		rcu_read_unlock();
 
 		kmem_cache_free(iocontext_cachep, ioc);
 	}
@@ -3614,10 +3636,13 @@ static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
 	struct request_list *rl = &q->rq;
+	unsigned long nr;
+	int ret = queue_var_store(&nr, page, count);
+	if (nr < BLKDEV_MIN_RQ)
+		nr = BLKDEV_MIN_RQ;
 
-	int ret = queue_var_store(&q->nr_requests, page, count);
-	if (q->nr_requests < BLKDEV_MIN_RQ)
-		q->nr_requests = BLKDEV_MIN_RQ;
+	spin_lock_irq(q->queue_lock);
+	q->nr_requests = nr;
 	blk_queue_congestion_threshold(q);
 
 	if (rl->count[READ] >= queue_congestion_on_threshold(q))
@@ -3643,6 +3668,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 		blk_clear_queue_full(q, WRITE);
 		wake_up(&rl->wait[WRITE]);
 	}
+	spin_unlock_irq(q->queue_lock);
 	return ret;
 }
 
@@ -3758,13 +3784,19 @@ static ssize_t
 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
 	struct queue_sysfs_entry *entry = to_queue(attr);
-	struct request_queue *q;
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
+	ssize_t res;
 
-	q = container_of(kobj, struct request_queue, kobj);
 	if (!entry->show)
 		return -EIO;
-
-	return entry->show(q, page);
+	mutex_lock(&q->sysfs_lock);
+	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+		mutex_unlock(&q->sysfs_lock);
+		return -ENOENT;
+	}
+	res = entry->show(q, page);
+	mutex_unlock(&q->sysfs_lock);
+	return res;
 }
 
 static ssize_t
@@ -3772,13 +3804,20 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 		    const char *page, size_t length)
 {
 	struct queue_sysfs_entry *entry = to_queue(attr);
-	struct request_queue *q;
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
+
+	ssize_t res;
 
-	q = container_of(kobj, struct request_queue, kobj);
 	if (!entry->store)
 		return -EIO;
-
-	return entry->store(q, page, length);
+	mutex_lock(&q->sysfs_lock);
+	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+		mutex_unlock(&q->sysfs_lock);
+		return -ENOENT;
+	}
+	res = entry->store(q, page, length);
+	mutex_unlock(&q->sysfs_lock);
+	return res;
 }
 
 static struct sysfs_ops queue_sysfs_ops = {
@@ -3789,6 +3828,7 @@ static struct sysfs_ops queue_sysfs_ops = {
 static struct kobj_type queue_ktype = {
 	.sysfs_ops	= &queue_sysfs_ops,
 	.default_attrs	= default_attrs,
+	.release	= blk_release_queue,
 };
 
 int blk_register_queue(struct gendisk *disk)
@@ -3801,19 +3841,17 @@ int blk_register_queue(struct gendisk *disk)
 		return -ENXIO;
 
 	q->kobj.parent = kobject_get(&disk->kobj);
-	if (!q->kobj.parent)
-		return -EBUSY;
 
-	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
-	q->kobj.ktype = &queue_ktype;
-
-	ret = kobject_register(&q->kobj);
+	ret = kobject_add(&q->kobj);
 	if (ret < 0)
 		return ret;
 
+	kobject_uevent(&q->kobj, KOBJ_ADD);
+
 	ret = elv_register_queue(q);
 	if (ret) {
-		kobject_unregister(&q->kobj);
+		kobject_uevent(&q->kobj, KOBJ_REMOVE);
+		kobject_del(&q->kobj);
 		return ret;
 	}
 
@@ -3827,7 +3865,8 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (q && q->request_fn) {
 		elv_unregister_queue(q);
 
-		kobject_unregister(&q->kobj);
+		kobject_uevent(&q->kobj, KOBJ_REMOVE);
+		kobject_del(&q->kobj);
 		kobject_put(&disk->kobj);
 	}
 }

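Aside: the ll_rw_blk.c hunks replace the queue's open-coded refcnt with its kobject's reference count, moving the actual freeing into a ktype release method that runs only when the last kobject_put() drops the count to zero; blk_cleanup_queue() now just marks the queue dead, exits the elevator, and drops one reference. The release-on-last-put shape, sketched in userspace (single-threaded stand-in for the kernel's atomic kref):

	#include <stdio.h>
	#include <stdlib.h>

	struct obj {
		int refcount;
		void (*release)(struct obj *);	/* like kobj_type.release */
	};

	static void obj_put(struct obj *o)
	{
		if (--o->refcount == 0)
			o->release(o);
	}

	static void queue_release(struct obj *o)
	{
		puts("last reference gone: freeing queue");
		free(o);
	}

	int main(void)
	{
		struct obj *q = malloc(sizeof(*q));
		q->refcount = 2;		/* e.g. creator plus one extra holder */
		q->release = queue_release;
		obj_put(q);			/* still held elsewhere: nothing freed */
		obj_put(q);			/* release fires here */
		return 0;
	}
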
+ 2 - 2
drivers/block/loop.c

@@ -1307,7 +1307,7 @@ static int __init loop_init(void)
 
out_mem4:
 	while (i--)
-		blk_put_queue(loop_dev[i].lo_queue);
+		blk_cleanup_queue(loop_dev[i].lo_queue);
 	devfs_remove("loop");
 	i = max_loop;
out_mem3:
@@ -1328,7 +1328,7 @@ static void loop_exit(void)
 
 	for (i = 0; i < max_loop; i++) {
 		del_gendisk(disks[i]);
-		blk_put_queue(loop_dev[i].lo_queue);
+		blk_cleanup_queue(loop_dev[i].lo_queue);
 		put_disk(disks[i]);
 	}
 	devfs_remove("loop");

+ 2 - 2
drivers/block/pktcdvd.c

@@ -2514,7 +2514,7 @@ static int pkt_setup_dev(struct pkt_ctrl_command *ctrl_cmd)
 	return 0;
 
out_new_dev:
-	blk_put_queue(disk->queue);
+	blk_cleanup_queue(disk->queue);
out_mem2:
 	put_disk(disk);
out_mem:
@@ -2555,7 +2555,7 @@ static int pkt_remove_dev(struct pkt_ctrl_command *ctrl_cmd)
 	DPRINTK("pktcdvd: writer %s unmapped\n", pd->name);
 
 	del_gendisk(pd->disk);
-	blk_put_queue(pd->disk->queue);
+	blk_cleanup_queue(pd->disk->queue);
 	put_disk(pd->disk);
 
 	pkt_devs[idx] = NULL;

+ 1 - 1
drivers/block/umem.c

@@ -1131,7 +1131,7 @@ static void mm_pci_remove(struct pci_dev *dev)
 		pci_free_consistent(card->dev, PAGE_SIZE*2,
 				    card->mm_pages[1].desc,
 				    card->mm_pages[1].page_dma);
-	blk_put_queue(card->queue);
+	blk_cleanup_queue(card->queue);
 }
 
 static const struct pci_device_id mm_pci_ids[] = { {

+ 2 - 2
drivers/md/dm.c

@@ -840,7 +840,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
 bad3:
 	mempool_destroy(md->io_pool);
 bad2:
-	blk_put_queue(md->queue);
+	blk_cleanup_queue(md->queue);
 	free_minor(minor);
 bad1:
 	kfree(md);
@@ -860,7 +860,7 @@ static void free_dev(struct mapped_device *md)
 	del_gendisk(md->disk);
 	free_minor(minor);
 	put_disk(md->disk);
-	blk_put_queue(md->queue);
+	blk_cleanup_queue(md->queue);
 	kfree(md);
 }
 

+ 4 - 1
drivers/md/md.c

@@ -213,8 +213,11 @@ static void mddev_put(mddev_t *mddev)
 		return;
 	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
 		list_del(&mddev->all_mddevs);
-		blk_put_queue(mddev->queue);
+		/* that blocks */
+		blk_cleanup_queue(mddev->queue);
+		/* that also blocks */
 		kobject_unregister(&mddev->kobj);
+		/* result blows... */
 	}
 	spin_unlock(&all_mddevs_lock);
 }

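Aside: the five driver hunks above (loop, pktcdvd, umem, dm, md) are the fallout of the queue refcounting change — teardown work that used to piggyback on the final blk_put_queue() now lives in blk_cleanup_queue(), so device-removal paths must call that instead, while plain blk_put_queue() remains for holders that only ever took a reference. The new comments in md.c flag that mddev_put() now does this potentially blocking cleanup while holding all_mddevs_lock — apparently a known wart rather than an oversight.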
+ 23 - 47
drivers/net/3c509.c

@@ -100,6 +100,10 @@ static int max_interrupt_work = 10;
 static char versionA[] __initdata = DRV_NAME ".c:" DRV_VERSION " " DRV_RELDATE " becker@scyld.com\n";
 static char versionB[] __initdata = "http://www.scyld.com/network/3c509.html\n";
 
+#if defined(CONFIG_PM) && (defined(CONFIG_MCA) || defined(CONFIG_EISA))
+#define EL3_SUSPEND
+#endif
+
 #ifdef EL3_DEBUG
 static int el3_debug = EL3_DEBUG;
 #else
@@ -174,9 +178,6 @@ struct el3_private {
 	/* skb send-queue */
 	int head, size;
 	struct sk_buff *queue[SKB_QUEUE_SIZE];
-#ifdef CONFIG_PM_LEGACY
-	struct pm_dev *pmdev;
-#endif
 	enum {
 		EL3_MCA,
 		EL3_PNP,
@@ -201,11 +202,15 @@ static void el3_tx_timeout (struct net_device *dev);
 static void el3_down(struct net_device *dev);
 static void el3_up(struct net_device *dev);
 static struct ethtool_ops ethtool_ops;
-#ifdef CONFIG_PM_LEGACY
-static int el3_suspend(struct pm_dev *pdev);
-static int el3_resume(struct pm_dev *pdev);
-static int el3_pm_callback(struct pm_dev *pdev, pm_request_t rqst, void *data);
+#ifdef EL3_SUSPEND
+static int el3_suspend(struct device *, pm_message_t);
+static int el3_resume(struct device *);
+#else
+#define el3_suspend NULL
+#define el3_resume NULL
 #endif
+
+
 /* generic device remove for all device types */
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
 static int el3_device_remove (struct device *device);
@@ -229,7 +234,9 @@ static struct eisa_driver el3_eisa_driver = {
 		.driver   = {
 				.name    = "3c509",
 				.probe   = el3_eisa_probe,
-				.remove  = __devexit_p (el3_device_remove)
+				.remove  = __devexit_p (el3_device_remove),
+				.suspend = el3_suspend,
+				.resume  = el3_resume,
 		}
 };
 #endif
@@ -262,6 +269,8 @@ static struct mca_driver el3_mca_driver = {
 				.bus = &mca_bus_type,
 				.probe = el3_mca_probe,
 				.remove = __devexit_p(el3_device_remove),
+				.suspend = el3_suspend,
+				.resume  = el3_resume,
 		},
 };
 #endif /* CONFIG_MCA */
@@ -362,10 +371,6 @@ static void el3_common_remove (struct net_device *dev)
 	struct el3_private *lp = netdev_priv(dev);
 
 	(void) lp;				/* Keep gcc quiet... */
-#ifdef CONFIG_PM_LEGACY
-	if (lp->pmdev)
-		pm_unregister(lp->pmdev);
-#endif
 #if defined(__ISAPNP__)
 	if (lp->type == EL3_PNP)
 		pnp_device_detach(to_pnp_dev(lp->dev));
@@ -572,16 +577,6 @@ no_pnp:
 	if (err)
 		goto out1;
 
-#ifdef CONFIG_PM_LEGACY
-	/* register power management */
-	lp->pmdev = pm_register(PM_ISA_DEV, card_idx, el3_pm_callback);
-	if (lp->pmdev) {
-		struct pm_dev *p;
-		p = lp->pmdev;
-		p->data = (struct net_device *)dev;
-	}
-#endif
-
 	el3_cards++;
 	lp->next_dev = el3_root_dev;
 	el3_root_dev = dev;
@@ -1480,20 +1475,17 @@ el3_up(struct net_device *dev)
 }
 
 /* Power Management support functions */
-#ifdef CONFIG_PM_LEGACY
+#ifdef EL3_SUSPEND
 
 static int
-el3_suspend(struct pm_dev *pdev)
+el3_suspend(struct device *pdev, pm_message_t state)
 {
 	unsigned long flags;
 	struct net_device *dev;
 	struct el3_private *lp;
 	int ioaddr;
 	
-	if (!pdev && !pdev->data)
-		return -EINVAL;
-
-	dev = (struct net_device *)pdev->data;
+	dev = pdev->driver_data;
 	lp = netdev_priv(dev);
 	ioaddr = dev->base_addr;
 
@@ -1510,17 +1502,14 @@ el3_suspend(struct pm_dev *pdev)
 }
 
 static int
-el3_resume(struct pm_dev *pdev)
+el3_resume(struct device *pdev)
 {
 	unsigned long flags;
 	struct net_device *dev;
 	struct el3_private *lp;
 	int ioaddr;
 	
-	if (!pdev && !pdev->data)
-		return -EINVAL;
-
-	dev = (struct net_device *)pdev->data;
+	dev = pdev->driver_data;
 	lp = netdev_priv(dev);
 	ioaddr = dev->base_addr;
 
@@ -1536,20 +1525,7 @@ el3_resume(struct pm_dev *pdev)
 	return 0;
 }
 
-static int
-el3_pm_callback(struct pm_dev *pdev, pm_request_t rqst, void *data)
-{
-	switch (rqst) {
-		case PM_SUSPEND:
-			return el3_suspend(pdev);
-
-		case PM_RESUME:
-			return el3_resume(pdev);
-	}
-	return 0;
-}
-
-#endif /* CONFIG_PM_LEGACY */
+#endif /* EL3_SUSPEND */
 
 /* Parameters that may be passed into the module. */
 static int debug = -1;

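Aside: with the legacy pm_register() interface dropped, 3c509 hooks suspend/resume directly into its eisa/mca driver structs, and the #else branch defines the handlers to NULL so those initializers compile unchanged when power management is configured out. That stub idiom in a self-contained userspace sketch (hypothetical names):

	#include <stdio.h>

	/* #define HAVE_SUSPEND */	/* flip on, like EL3_SUSPEND */

	#ifdef HAVE_SUSPEND
	static int dev_suspend(void) { puts("suspending"); return 0; }
	#else
	#define dev_suspend NULL	/* initializer stays valid, hook unset */
	#endif

	struct driver { int (*suspend)(void); };

	static struct driver drv = { .suspend = dev_suspend };

	int main(void)
	{
		if (drv.suspend)
			drv.suspend();
		else
			puts("no suspend hook compiled in");
		return 0;
	}
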
+ 5 - 4
drivers/net/3c523.c

@@ -105,6 +105,7 @@
 #include <linux/mca-legacy.h>
 #include <linux/ethtool.h>
 #include <linux/bitops.h>
+#include <linux/jiffies.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -658,7 +659,7 @@ static int init586(struct net_device *dev)
 
 	s = jiffies;		/* warning: only active with interrupts on !! */
 	while (!(cfg_cmd->cmd_status & STAT_COMPL)) {
-		if (jiffies - s > 30*HZ/100)
+		if (time_after(jiffies, s + 30*HZ/100))
 			break;
 	}
 
@@ -684,7 +685,7 @@ static int init586(struct net_device *dev)
 
 	s = jiffies;
 	while (!(ias_cmd->cmd_status & STAT_COMPL)) {
-		if (jiffies - s > 30*HZ/100)
+		if (time_after(jiffies, s + 30*HZ/100))
 			break;
 	}
 
@@ -709,7 +710,7 @@ static int init586(struct net_device *dev)
 
 	s = jiffies;
 	while (!(tdr_cmd->cmd_status & STAT_COMPL)) {
-		if (jiffies - s > 30*HZ/100) {
+		if (time_after(jiffies, s + 30*HZ/100)) {
 			printk(KERN_WARNING "%s: %d Problems while running the TDR.\n", dev->name, __LINE__);
 			result = 1;
 			break;
@@ -798,7 +799,7 @@ static int init586(struct net_device *dev)
 			elmc_id_attn586();
 			s = jiffies;
 			while (!(mc_cmd->cmd_status & STAT_COMPL)) {
-				if (jiffies - s > 30*HZ/100)
+				if (time_after(jiffies, s + 30*HZ/100))
 					break;
 			}
 			if (!(mc_cmd->cmd_status & STAT_COMPL)) {

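Aside: the 3c523 and 3c59x hunks convert raw jiffies comparisons to time_after(), which stays correct across jiffies wraparound because the test is done on the signed difference. Essentially (the real macro in <linux/jiffies.h> adds type checks):

	#include <stdio.h>
	#include <limits.h>

	#define time_after(a, b) ((long)(b) - (long)(a) < 0)

	int main(void)
	{
		unsigned long s = ULONG_MAX - 5;	/* just before wraparound */
		unsigned long now = 10;			/* 16 ticks later, post-wrap */

		/* unsigned subtraction still yields the true elapsed ticks... */
		printf("elapsed=%lu\n", now - s);		/* 16 */
		/* ...and time_after() agrees the deadline has passed */
		printf("after=%d\n", time_after(now, s + 3));	/* 1 */
		return 0;
	}
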
+ 4 - 3
drivers/net/3c59x.c

@@ -258,6 +258,7 @@ static int vortex_debug = 1;
 #include <linux/highmem.h>
 #include <linux/eisa.h>
 #include <linux/bitops.h>
+#include <linux/jiffies.h>
 #include <asm/irq.h>			/* For NR_IRQS only. */
 #include <asm/io.h>
 #include <asm/uaccess.h>
@@ -841,7 +842,7 @@ enum xcvr_types {
 	XCVR_100baseFx, XCVR_MII=6, XCVR_NWAY=8, XCVR_ExtMII=9, XCVR_Default=10,
 };
 
-static struct media_table {
+static const struct media_table {
 	char *name;
 	unsigned int media_bits:16,		/* Bits to set in Wn4_Media register. */
 		mask:8,						/* The transceiver-present bit in Wn3_Config.*/
@@ -1445,7 +1446,7 @@ static int __devinit vortex_probe1(struct device *gendev,
 	}
 
 	{
-		static const char * ram_split[] = {"5:3", "3:1", "1:1", "3:5"};
+		static const char * const ram_split[] = {"5:3", "3:1", "1:1", "3:5"};
 		unsigned int config;
 		EL3WINDOW(3);
 		vp->available_media = ioread16(ioaddr + Wn3_Options);
@@ -2724,7 +2725,7 @@ boomerang_rx(struct net_device *dev)
 			skb = dev_alloc_skb(PKT_BUF_SZ);
 			if (skb == NULL) {
 				static unsigned long last_jif;
-				if ((jiffies - last_jif) > 10 * HZ) {
+				if (time_after(jiffies, last_jif + 10 * HZ)) {
 					printk(KERN_WARNING "%s: memory shortage\n", dev->name);
 					last_jif = jiffies;
 				}

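Aside: the remaining 3c59x tweaks add const qualifiers so the media table and the ram_split strings can be placed in read-only data — const on both the pointer array and the strings it points at:

	#include <stdio.h>

	/* both levels const, as in the 3c59x hunk: the array of pointers is
	   immutable and so are the strings it references */
	static const char * const ram_split[] = { "5:3", "3:1", "1:1", "3:5" };

	int main(void)
	{
		printf("%s\n", ram_split[0]);
		/* ram_split[0] = "x";  <- would now be a compile error */
		return 0;
	}
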
Some files were not shown because too many files changed in this diff