Merge tag 'platform-drivers-x86-v4.5-2' of git://git.infradead.org/users/dvhart/linux...
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Jan 2016 20:45:35 +0000 (12:45 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 24 Jan 2016 20:45:35 +0000 (12:45 -0800)
Pull x86 platform driver updates from Darren Hart:
 "Emergency travel prevented me from completing my final testing on this
  until today.  Nothing here that couldn't wait until RC1 fixes, but I
  thought it best to get it out sooner rather than later as it does
  contain a build warning fix.

  Summary:

  A build warning fix, MAINTAINERS cleanup, and a new DMI quirk:

  ideapad-laptop:
   - Add Lenovo Yoga 700 to no_hw_rfkill dmi list

  MAINTAINERS:
   - Combine multiple telemetry entries

  intel_telemetry_debugfs:
   - Fix unused warnings in telemetry debugfs"

* tag 'platform-drivers-x86-v4.5-2' of git://git.infradead.org/users/dvhart/linux-platform-drivers-x86:
  ideapad-laptop: Add Lenovo Yoga 700 to no_hw_rfkill dmi list
  MAINTAINERS: Combine multiple telemetry entries
  intel_telemetry_debugfs: Fix unused warnings in telemetry debugfs

620 files changed:
Documentation/ABI/testing/configfs-rdma_cm [new file with mode: 0644]
Documentation/ABI/testing/sysfs-class-infiniband [new file with mode: 0644]
Documentation/devicetree/bindings/input/gpio-keys.txt
Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
Documentation/infiniband/core_locking.txt
Documentation/kernel-per-CPU-kthreads.txt
Documentation/sysctl/fs.txt
MAINTAINERS
arch/arm/Kconfig.debug
arch/arm/boot/dts/r8a7740-armadillo800eva.dts
arch/arm/mach-realview/Makefile
arch/arm/mach-tegra/Kconfig
arch/arm/mach-tegra/sleep-tegra20.S
arch/arm/mach-tegra/sleep-tegra30.S
arch/arm/mm/dma-mapping.c
arch/arm64/Kconfig.platforms
arch/arm64/boot/dts/Makefile
arch/arm64/boot/dts/nvidia/Makefile [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra132-norrin.dts [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra132.dtsi [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210-p2571.dts [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi [new file with mode: 0644]
arch/arm64/boot/dts/nvidia/tegra210.dtsi [new file with mode: 0644]
arch/ia64/include/asm/unistd.h
arch/ia64/include/uapi/asm/unistd.h
arch/ia64/kernel/entry.S
arch/powerpc/platforms/cell/spufs/file.c
arch/powerpc/platforms/cell/spufs/inode.c
arch/s390/hypfs/inode.c
arch/x86/include/asm/pmem.h
block/Makefile
block/bio-integrity.c
block/blk-core.c
block/blk-iopoll.c [deleted file]
block/blk-mq.c
block/blk-timeout.c
block/blk.h
block/ioctl.c
crypto/af_alg.c
crypto/ahash.c
crypto/algif_hash.c
crypto/algif_skcipher.c
crypto/crc32c_generic.c
crypto/shash.c
crypto/skcipher.c
drivers/acpi/apei/erst.c
drivers/amba/Kconfig
drivers/base/devtmpfs.c
drivers/block/aoe/aoecmd.c
drivers/block/drbd/drbd_actlog.c
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_debugfs.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_proc.c
drivers/block/drbd/drbd_protocol.h
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_req.h
drivers/block/drbd/drbd_state.c
drivers/block/drbd/drbd_state.h
drivers/block/drbd/drbd_state_change.h [new file with mode: 0644]
drivers/block/drbd/drbd_worker.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/null_blk.c
drivers/block/rbd.c
drivers/block/sx8.c
drivers/block/xen-blkback/blkback.c
drivers/block/xen-blkback/common.h
drivers/block/xen-blkback/xenbus.c
drivers/block/xen-blkfront.c
drivers/char/mem.c
drivers/char/mspec.c
drivers/char/ps3flash.c
drivers/crypto/Kconfig
drivers/crypto/atmel-aes.c
drivers/crypto/qat/qat_common/qat_hal.c
drivers/gpu/drm/drm_hashtab.c
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/addr.c
drivers/infiniband/core/cache.c
drivers/infiniband/core/cm.c
drivers/infiniband/core/cma.c
drivers/infiniband/core/cma_configfs.c [new file with mode: 0644]
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/cq.c [new file with mode: 0644]
drivers/infiniband/core/device.c
drivers/infiniband/core/fmr_pool.c
drivers/infiniband/core/mad.c
drivers/infiniband/core/mad_priv.h
drivers/infiniband/core/multicast.c
drivers/infiniband/core/roce_gid_mgmt.c
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ud_header.c
drivers/infiniband/core/umem_odp.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/uverbs_marshall.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/cxgb3/iwch_cm.c
drivers/infiniband/hw/cxgb3/iwch_cq.c
drivers/infiniband/hw/cxgb3/iwch_mem.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb3/iwch_provider.h
drivers/infiniband/hw/cxgb3/iwch_qp.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/cq.c
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/cxgb4/iw_cxgb4.h
drivers/infiniband/hw/cxgb4/mem.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/cxgb4/t4.h
drivers/infiniband/hw/cxgb4/user.h
drivers/infiniband/hw/mlx4/ah.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/mr.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx4/srq.c
drivers/infiniband/hw/mlx5/ah.c
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/odp.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mlx5/srq.c
drivers/infiniband/hw/mlx5/user.h
drivers/infiniband/hw/mthca/mthca_cq.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/mthca/mthca_qp.c
drivers/infiniband/hw/nes/nes_cm.c
drivers/infiniband/hw/nes/nes_cm.h
drivers/infiniband/hw/nes/nes_utils.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/hw/nes/nes_verbs.h
drivers/infiniband/hw/ocrdma/ocrdma_ah.c
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
drivers/infiniband/hw/qib/qib_fs.c
drivers/infiniband/hw/qib/qib_mr.c
drivers/infiniband/hw/qib/qib_qp.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/hw/qib/qib_verbs_mcast.c
drivers/infiniband/hw/usnic/usnic_debugfs.c
drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.h
drivers/infiniband/hw/usnic/usnic_vnic.c
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/isert/ib_isert.h
drivers/infiniband/ulp/isert/isert_proto.h [deleted file]
drivers/infiniband/ulp/srp/ib_srp.c
drivers/infiniband/ulp/srp/ib_srp.h
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/infiniband/ulp/srpt/ib_srpt.h
drivers/input/joystick/xpad.c
drivers/input/keyboard/gpio_keys.c
drivers/input/touchscreen/atmel_mxt_ts.c
drivers/lightnvm/Makefile
drivers/lightnvm/core.c
drivers/lightnvm/gennvm.c
drivers/lightnvm/rrpc.c
drivers/lightnvm/rrpc.h
drivers/lightnvm/sysblk.c [new file with mode: 0644]
drivers/md/bcache/btree.c
drivers/md/bcache/super.c
drivers/md/bcache/writeback.c
drivers/md/bcache/writeback.h
drivers/mmc/core/debugfs.c
drivers/mmc/core/pwrseq_simple.c
drivers/mmc/core/sd.c
drivers/mmc/core/sdio.c
drivers/mmc/core/sdio_cis.c
drivers/mmc/host/mmci.c
drivers/mmc/host/tmio_mmc_dma.c
drivers/mtd/ubi/cdev.c
drivers/net/ethernet/mellanox/mlx4/fw.c
drivers/net/ethernet/mellanox/mlx4/mlx4.h
drivers/net/ethernet/mellanox/mlx4/port.c
drivers/net/ethernet/mellanox/mlx4/qp.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/eq.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/qp.c
drivers/net/ethernet/mellanox/mlx5/core/srq.c
drivers/net/ethernet/mellanox/mlx5/core/transobj.c
drivers/net/ethernet/mellanox/mlx5/core/transobj.h [deleted file]
drivers/net/ethernet/mellanox/mlx5/core/vport.c
drivers/ntb/hw/Kconfig
drivers/ntb/hw/Makefile
drivers/ntb/hw/amd/Kconfig [new file with mode: 0644]
drivers/ntb/hw/amd/Makefile [new file with mode: 0644]
drivers/ntb/hw/amd/ntb_hw_amd.c [new file with mode: 0644]
drivers/ntb/hw/amd/ntb_hw_amd.h [new file with mode: 0644]
drivers/ntb/hw/intel/ntb_hw_intel.c
drivers/ntb/hw/intel/ntb_hw_intel.h
drivers/ntb/ntb_transport.c
drivers/ntb/test/Kconfig
drivers/ntb/test/Makefile
drivers/ntb/test/ntb_perf.c [new file with mode: 0644]
drivers/nvme/host/Kconfig
drivers/nvme/host/Makefile
drivers/nvme/host/core.c [new file with mode: 0644]
drivers/nvme/host/lightnvm.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/scsi.c
drivers/oprofile/oprofilefs.c
drivers/scsi/3w-xxxx.c
drivers/scsi/Kconfig
drivers/scsi/NCR5380.c
drivers/scsi/NCR5380.h
drivers/scsi/arm/cumana_1.c
drivers/scsi/arm/oak.c
drivers/scsi/atari_NCR5380.c
drivers/scsi/atari_scsi.c
drivers/scsi/be2iscsi/Kconfig
drivers/scsi/be2iscsi/be.h
drivers/scsi/be2iscsi/be_iscsi.c
drivers/scsi/be2iscsi/be_main.c
drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
drivers/scsi/dmx3191d.c
drivers/scsi/dtc.c
drivers/scsi/dtc.h
drivers/scsi/g_NCR5380.c
drivers/scsi/g_NCR5380.h
drivers/scsi/hisi_sas/hisi_sas_v1_hw.c
drivers/scsi/imm.c
drivers/scsi/ipr.c
drivers/scsi/ipr.h
drivers/scsi/mac_scsi.c
drivers/scsi/megaraid/megaraid_mm.c
drivers/scsi/pas16.c
drivers/scsi/pas16.h
drivers/scsi/scsi_devinfo.c
drivers/scsi/storvsc_drv.c
drivers/scsi/sun3_scsi.c
drivers/scsi/t128.c
drivers/scsi/t128.h
drivers/soc/Kconfig
drivers/soc/qcom/spm.c
drivers/soc/tegra/Kconfig [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
drivers/staging/lustre/lustre/llite/dir.c
drivers/staging/lustre/lustre/llite/file.c
drivers/staging/lustre/lustre/llite/llite_internal.h
drivers/staging/lustre/lustre/llite/llite_lib.c
drivers/staging/lustre/lustre/llite/llite_nfs.c
drivers/staging/lustre/lustre/llite/lloop.c
drivers/staging/lustre/lustre/llite/rw.c
drivers/staging/lustre/lustre/llite/rw26.c
drivers/staging/lustre/lustre/llite/vvp_io.c
drivers/staging/lustre/lustre/llite/vvp_page.c
drivers/staging/rdma/amso1100/c2_cq.c
drivers/staging/rdma/amso1100/c2_provider.c
drivers/staging/rdma/ehca/ehca_classes.h
drivers/staging/rdma/ehca/ehca_iverbs.h
drivers/staging/rdma/ehca/ehca_main.c
drivers/staging/rdma/ehca/ehca_mrmw.c
drivers/staging/rdma/ehca/ehca_mrmw.h
drivers/staging/rdma/ehca/ehca_reqs.c
drivers/staging/rdma/hfi1/mr.c
drivers/staging/rdma/hfi1/verbs.c
drivers/staging/rdma/hfi1/verbs.h
drivers/staging/rdma/ipath/ipath_fs.c
drivers/staging/rdma/ipath/ipath_mr.c
drivers/staging/rdma/ipath/ipath_verbs.c
drivers/staging/rdma/ipath/ipath_verbs.h
drivers/target/target_core_iblock.c
drivers/thermal/int340x_thermal/processor_thermal_device.c
drivers/thermal/intel_pch_thermal.c
drivers/thermal/rcar_thermal.c
drivers/thermal/rockchip_thermal.c
drivers/thermal/step_wise.c
drivers/thermal/thermal_core.c
drivers/thermal/thermal_core.h
drivers/usb/gadget/function/f_printer.c
drivers/usb/gadget/legacy/inode.c
drivers/usb/gadget/udc/atmel_usba_udc.c
drivers/usb/host/Kconfig
drivers/video/fbdev/core/fb_defio.c
fs/9p/vfs_file.c
fs/affs/file.c
fs/afs/flock.c
fs/afs/write.c
fs/attr.c
fs/binfmt_misc.c
fs/block_dev.c
fs/btrfs/backref.c
fs/btrfs/ctree.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file.c
fs/btrfs/inode-map.c
fs/btrfs/inode-map.h
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/raid56.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/volumes.c
fs/btrfs/xattr.c
fs/cachefiles/interface.c
fs/cachefiles/namei.c
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/caps.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/cifs/cifs_debug.c
fs/cifs/cifs_debug.h
fs/cifs/cifsfs.c
fs/cifs/cifsglob.h
fs/cifs/cifsproto.h
fs/cifs/connect.c
fs/cifs/file.c
fs/cifs/inode.c
fs/cifs/misc.c
fs/cifs/readdir.c
fs/cifs/smb2misc.c
fs/cifs/smb2ops.c
fs/cifs/smb2pdu.c
fs/cifs/smb2pdu.h
fs/cifs/smb2proto.h
fs/cifs/smb2transport.c
fs/cifs/transport.c
fs/coda/coda_linux.h
fs/coda/dir.c
fs/coda/file.c
fs/configfs/dir.c
fs/configfs/file.c
fs/configfs/inode.c
fs/dax.c
fs/dcache.c
fs/debugfs/inode.c
fs/devpts/inode.c
fs/direct-io.c
fs/dlm/user.c
fs/ecryptfs/inode.c
fs/ecryptfs/mmap.c
fs/efivarfs/file.c
fs/efivarfs/super.c
fs/exec.c
fs/exofs/file.c
fs/exportfs/expfs.c
fs/ext2/file.c
fs/ext2/ioctl.c
fs/ext4/crypto.c
fs/ext4/crypto_key.c
fs/ext4/ext4.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/namei.c
fs/ext4/super.c
fs/ext4/truncate.h
fs/f2fs/data.c
fs/f2fs/file.c
fs/fat/dir.c
fs/fat/file.c
fs/filesystems.c
fs/fuse/dir.c
fs/fuse/file.c
fs/gfs2/file.c
fs/gfs2/inode.c
fs/gfs2/quota.c
fs/hfs/dir.c
fs/hfs/inode.c
fs/hfsplus/dir.c
fs/hfsplus/inode.c
fs/hfsplus/ioctl.c
fs/hostfs/hostfs_kern.c
fs/hpfs/dir.c
fs/hugetlbfs/inode.c
fs/inode.c
fs/ioctl.c
fs/jffs2/build.c
fs/jffs2/file.c
fs/jffs2/fs.c
fs/jffs2/super.c
fs/jfs/file.c
fs/jfs/ioctl.c
fs/jfs/super.c
fs/kernfs/dir.c
fs/libfs.c
fs/locks.c
fs/logfs/file.c
fs/namei.c
fs/namespace.c
fs/ncpfs/dir.c
fs/ncpfs/file.c
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs42proc.c
fs/nfs/nfs4file.c
fs/nfs/write.c
fs/nfsd/nfs4proc.c
fs/nfsd/nfs4recover.c
fs/nfsd/nfsfh.h
fs/nfsd/vfs.c
fs/nilfs2/inode.c
fs/nilfs2/ioctl.c
fs/ntfs/dir.c
fs/ntfs/file.c
fs/ntfs/quota.c
fs/ntfs/super.c
fs/ocfs2/alloc.c
fs/ocfs2/aops.c
fs/ocfs2/dir.c
fs/ocfs2/dlmglue.c
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/ioctl.c
fs/ocfs2/journal.c
fs/ocfs2/localalloc.c
fs/ocfs2/move_extents.c
fs/ocfs2/namei.c
fs/ocfs2/quota_global.c
fs/ocfs2/refcounttree.c
fs/ocfs2/resize.c
fs/ocfs2/suballoc.c
fs/ocfs2/xattr.c
fs/open.c
fs/overlayfs/copy_up.c
fs/overlayfs/dir.c
fs/overlayfs/inode.c
fs/overlayfs/readdir.c
fs/overlayfs/super.c
fs/pipe.c
fs/proc/kcore.c
fs/proc/self.c
fs/proc/task_mmu.c
fs/proc/thread_self.c
fs/pstore/inode.c
fs/quota/dquot.c
fs/read_write.c
fs/readdir.c
fs/reiserfs/dir.c
fs/reiserfs/file.c
fs/reiserfs/ioctl.c
fs/reiserfs/super.c
fs/reiserfs/xattr.c
fs/tracefs/inode.c
fs/ubifs/dir.c
fs/ubifs/file.c
fs/ubifs/xattr.c
fs/udf/file.c
fs/udf/inode.c
fs/udf/super.c
fs/utimes.c
fs/xattr.c
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_fs.h
fs/xfs/xfs_buf.c
fs/xfs/xfs_file.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_trans_ail.c
include/crypto/hash.h
include/crypto/if_alg.h
include/crypto/skcipher.h
include/linux/aer.h
include/linux/bio.h
include/linux/blk-iopoll.h [deleted file]
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/ceph/ceph_frag.h
include/linux/ceph/messenger.h
include/linux/dax.h
include/linux/drbd.h
include/linux/drbd_genl.h
include/linux/fs.h
include/linux/huge_mm.h
include/linux/idr.h
include/linux/interrupt.h
include/linux/irq_poll.h [new file with mode: 0644]
include/linux/lightnvm.h
include/linux/lru_cache.h
include/linux/mlx4/cmd.h
include/linux/mlx4/device.h
include/linux/mlx4/qp.h
include/linux/mlx5/device.h
include/linux/mlx5/driver.h
include/linux/mlx5/mlx5_ifc.h
include/linux/mlx5/qp.h
include/linux/mlx5/transobj.h [new file with mode: 0644]
include/linux/mlx5/vport.h
include/linux/nvme.h
include/linux/pagemap.h
include/linux/pipe_fs_i.h
include/linux/pmem.h
include/linux/radix-tree.h
include/linux/sched.h
include/linux/shmem_fs.h
include/linux/sunrpc/svc_rdma.h
include/linux/thermal.h
include/rdma/ib_addr.h
include/rdma/ib_cache.h
include/rdma/ib_mad.h
include/rdma/ib_pack.h
include/rdma/ib_pma.h
include/rdma/ib_sa.h
include/rdma/ib_verbs.h
include/scsi/iser.h [new file with mode: 0644]
include/sound/timer.h
include/trace/events/ext4.h
include/trace/events/huge_memory.h
include/trace/events/irq.h
include/uapi/linux/Kbuild
include/uapi/linux/fs.h
include/uapi/linux/lightnvm.h
include/xen/interface/io/blkif.h
ipc/mqueue.c
ipc/sem.c
ipc/util.c
ipc/util.h
kernel/audit_fsnotify.c
kernel/audit_watch.c
kernel/events/core.c
kernel/relay.c
kernel/sched/core.c
kernel/sysctl.c
lib/Kconfig
lib/Makefile
lib/irq_poll.c [new file with mode: 0644]
lib/libcrc32c.c
lib/lru_cache.c
lib/ratelimit.c
mm/filemap.c
mm/huge_memory.c
mm/memcontrol.c
mm/mincore.c
mm/mlock.c
mm/percpu.c
mm/shmem.c
mm/swapfile.c
mm/truncate.c
mm/vmscan.c
mm/vmstat.c
mm/workingset.c
net/9p/trans_fd.c
net/9p/trans_virtio.c
net/ceph/auth_x.c
net/ceph/auth_x.h
net/ceph/messenger.c
net/ceph/mon_client.c
net/ipv4/fib_trie.c
net/rds/ib.c
net/rds/iw.c
net/sunrpc/cache.c
net/sunrpc/rpc_pipe.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/Makefile
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/svc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_backchannel.c [new file with mode: 0644]
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_sendto.c
net/sunrpc/xprtrdma/svc_rdma_transport.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
security/inode.c
security/integrity/ima/ima_main.c
security/selinux/selinuxfs.c
sound/core/control.c
sound/core/hrtimer.c
sound/core/pcm_compat.c
sound/core/seq/seq_compat.c
sound/core/timer.c
sound/hda/hdac_i915.c
sound/pci/hda/hda_bind.c
sound/pci/hda/hda_intel.c
sound/pci/hda/patch_realtek.c
sound/spi/at73c213.c
tools/lib/traceevent/event-parse.c
tools/perf/util/trace-event-parse.c

diff --git a/Documentation/ABI/testing/configfs-rdma_cm b/Documentation/ABI/testing/configfs-rdma_cm
new file mode 100644 (file)
index 0000000..5c389aa
--- /dev/null
@@ -0,0 +1,22 @@
+What:          /config/rdma_cm
+Date:          November 29, 2015
+KernelVersion:  4.4.0
+Description:   Interface used to configure RDMA-capable HCAs with respect to
+               RDMA-CM attributes.
+
+               Attributes are visible only when configfs is mounted. To mount
+               configfs in /config directory use:
+               # mount -t configfs none /config/
+
+               In order to set parameters related to a specific HCA, a directory
+               for this HCA has to be created:
+               mkdir -p /config/rdma_cm/<hca>
+
+
+What:          /config/rdma_cm/<hca>/ports/<port-num>/default_roce_mode
+Date:          November 29, 2015
+KernelVersion:  4.4.0
+Description:   RDMA-CM based connections from HCA <hca> at port <port-num>
+               will be initiated with this RoCE type as default.
+               The possible RoCE types are either "IB/RoCE v1" or "RoCE v2".
+               This parameter has RW access.
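Taken together, the two attributes above amount to a short usage session. A
minimal sketch, assuming a hypothetical HCA named mlx4_0 with port 1; the
first cat output is illustrative:

  # mount -t configfs none /config/
  # mkdir -p /config/rdma_cm/mlx4_0
  # cat /config/rdma_cm/mlx4_0/ports/1/default_roce_mode
  IB/RoCE v1
  # echo "RoCE v2" > /config/rdma_cm/mlx4_0/ports/1/default_roce_mode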
diff --git a/Documentation/ABI/testing/sysfs-class-infiniband b/Documentation/ABI/testing/sysfs-class-infiniband
new file mode 100644 (file)
index 0000000..a86abe6
--- /dev/null
@@ -0,0 +1,16 @@
+What:          /sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/ndevs/<gid-index>
+Date:          November 29, 2015
+KernelVersion: 4.4.0
+Contact:       linux-rdma@vger.kernel.org
+Description:   The name of the net device associated with the GID at
+               index <gid-index>.
+
+What:          /sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/types/<gid-index>
+Date:          November 29, 2015
+KernelVersion: 4.4.0
+Contact:       linux-rdma@vger.kernel.org
+Description:   The RoCE type of the GID at index <gid-index>. This is either
+               "IB/RoCE v1" for IB and RoCE v1 based GIDs or "RoCE v2" for
+               RoCE v2 based GIDs.
+
+
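The sysfs attributes are read-only counterparts; a sketch of inspecting one
GID table entry, again with a hypothetical HCA name, port, and GID index, and
illustrative output:

  # cat /sys/class/infiniband/mlx4_0/ports/1/gid_attrs/ndevs/0
  eth2
  # cat /sys/class/infiniband/mlx4_0/ports/1/gid_attrs/types/0
  IB/RoCE v1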
index cf1333d1dd52974e4e960a7c97418f86864e56f1..21641236c095dfc257b0d5375ef0e8710129e690 100644 (file)
@@ -6,6 +6,7 @@ Required properties:
 Optional properties:
        - autorepeat: Boolean, Enable auto repeat feature of Linux input
          subsystem.
+       - label: String, name of the input device.
 
 Each button (key) is represented as a sub-node of "gpio-keys":
 Subnode properties:
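For illustration, a device-tree fragment using the new optional property; the
node, GPIO phandle, and key code are made up. Note the per-button label still
names the individual key, while the new top-level label names the input
device:

	gpio-keys {
		compatible = "gpio-keys";
		label = "Front Panel Buttons";
		autorepeat;

		power {
			label = "Power Button";
			gpios = <&gpio1 2 GPIO_ACTIVE_LOW>;
			linux,code = <KEY_POWER>;
		};
	};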
index 0dfa60d88dd3b4bb4ce2a42d2c0b3d15f4835e45..08efe6bc21935fa36785612043489faab5ba33c7 100644 (file)
@@ -2,8 +2,10 @@
 
 Required properties:
 - compatible : should be "rockchip,<name>-tsadc"
+   "rockchip,rk3228-tsadc": found on RK3228 SoCs
    "rockchip,rk3288-tsadc": found on RK3288 SoCs
    "rockchip,rk3368-tsadc": found on RK3368 SoCs
+   "rockchip,rk3399-tsadc": found on RK3399 SoCs
 - reg : physical base address of the controller and length of memory mapped
        region.
 - interrupts : The interrupt number to the cpu. The interrupt specifier format
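A minimal node sketch for one of the newly added compatibles; the base
address, size, and interrupt specifier are placeholders rather than values
from a real board:

	tsadc: tsadc@11150000 {
		compatible = "rockchip,rk3228-tsadc";
		reg = <0x11150000 0x100>;
		interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;
	};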
index e1678542279a2d2ed8b3dd3222db81b267f7c2d4..4b1f36b6ada034000d3d8e80831f79b4921f0ab4 100644 (file)
@@ -15,7 +15,6 @@ Sleeping and interrupt context
     modify_ah
     query_ah
     destroy_ah
-    bind_mw
     post_send
     post_recv
     poll_cq
@@ -31,7 +30,6 @@ Sleeping and interrupt context
     ib_modify_ah
     ib_query_ah
     ib_destroy_ah
-    ib_bind_mw
     ib_post_send
     ib_post_recv
     ib_req_notify_cq
index f4cbfe0ba1085b4df3067dcc457219699c5c6150..edec3a3e648d12eee550f953b6dec87ab58e34f8 100644 (file)
@@ -90,7 +90,7 @@ BLOCK_SOFTIRQ:  Do all of the following:
        from being initiated from tasks that might run on the CPU to
        be de-jittered.  (It is OK to force this CPU offline and then
        bring it back online before you start your application.)
-BLOCK_IOPOLL_SOFTIRQ:  Do all of the following:
+IRQ_POLL_SOFTIRQ:  Do all of the following:
 1.     Force block-device interrupts onto some other CPU.
 2.     Initiate any block I/O and block-I/O polling on other CPUs.
 3.     Once your application has started, prevent CPU-hotplug operations
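Step 1 above is typically done through the IRQ affinity files; a sketch,
assuming the block device's interrupt is the hypothetical IRQ 42 and CPUs 0-1
are the housekeeping CPUs:

  # echo 3 > /proc/irq/42/smp_affinity    # CPU bitmask: CPUs 0 and 1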
index 88152f214f48cb69c643d4bf2ff2ac9a61ad2eb0..302b5ed616a6b2a5360e88e519140284b2f0e0d0 100644 (file)
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs:
 - nr_open
 - overflowuid
 - overflowgid
+- pipe-user-pages-hard
+- pipe-user-pages-soft
 - protected_hardlinks
 - protected_symlinks
 - suid_dumpable
@@ -159,6 +161,27 @@ The default is 65534.
 
 ==============================================================
 
+pipe-user-pages-hard:
+
+Maximum total number of pages a non-privileged user may allocate for pipes.
+Once this limit is reached, no new pipes may be allocated until usage goes
+below the limit again. When set to 0, no limit is applied, which is the default
+setting.
+
+==============================================================
+
+pipe-user-pages-soft:
+
+Maximum total number of pages a non-privileged user may allocate for pipes
+before the pipe size gets limited to a single page. Once this limit is reached,
+new pipes will be limited to a single page in size for this user in order to
+limit total memory usage, and trying to increase them using fcntl() will be
+denied until usage goes below the limit again. The default value allows
+allocation of up to 1024 pipes at their default size. When set to 0, no limit
+is applied.
+
+==============================================================
+
 protected_hardlinks:
 
 A long-standing class of security issues is the hardlink-based
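Both knobs are ordinary sysctls under fs.; a sketch of reading and tightening
them at runtime, with the soft default shown (1024 pipes of 16 pages each)
and an arbitrary example value for the hard limit:

  # sysctl fs.pipe-user-pages-soft
  fs.pipe-user-pages-soft = 16384
  # sysctl -w fs.pipe-user-pages-hard=32768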
index 4ad9d7c46cac2df28e024d7beeebdb77263bf7c8..78f71f471bd6df1fb3ca31ff6e32240f4dbdb90a 100644 (file)
@@ -3665,13 +3665,12 @@ F:      drivers/scsi/dpt*
 F:     drivers/scsi/dpt/
 
 DRBD DRIVER
-P:     Philipp Reisner
-P:     Lars Ellenberg
-M:     drbd-dev@lists.linbit.com
-L:     drbd-user@lists.linbit.com
+M:     Philipp Reisner <philipp.reisner@linbit.com>
+M:     Lars Ellenberg <lars.ellenberg@linbit.com>
+L:     drbd-dev@lists.linbit.com
 W:     http://www.drbd.org
-T:     git git://git.drbd.org/linux-2.6-drbd.git drbd
-T:     git git://git.drbd.org/drbd-8.3.git
+T:     git git://git.linbit.com/linux-drbd.git
+T:     git git://git.linbit.com/drbd-8.4.git
 S:     Supported
 F:     drivers/block/drbd/
 F:     lib/lru_cache.c
@@ -7150,27 +7149,45 @@ W:      https://linuxtv.org
 S:     Odd Fixes
 F:     drivers/media/radio/radio-miropcm20*
 
-Mellanox MLX5 core VPI driver
-M:     Eli Cohen <eli@mellanox.com>
+MELLANOX MLX4 core VPI driver
+M:     Yishai Hadas <yishaih@mellanox.com>
 L:     netdev@vger.kernel.org
 L:     linux-rdma@vger.kernel.org
 W:     http://www.mellanox.com
 Q:     http://patchwork.ozlabs.org/project/netdev/list/
+S:     Supported
+F:     drivers/net/ethernet/mellanox/mlx4/
+F:     include/linux/mlx4/
+
+MELLANOX MLX4 IB driver
+M:     Yishai Hadas <yishaih@mellanox.com>
+L:     linux-rdma@vger.kernel.org
+W:     http://www.mellanox.com
 Q:     http://patchwork.kernel.org/project/linux-rdma/list/
-T:     git git://openfabrics.org/~eli/connect-ib.git
+S:     Supported
+F:     drivers/infiniband/hw/mlx4/
+F:     include/linux/mlx4/
+
+MELLANOX MLX5 core VPI driver
+M:     Matan Barak <matanb@mellanox.com>
+M:     Leon Romanovsky <leonro@mellanox.com>
+L:     netdev@vger.kernel.org
+L:     linux-rdma@vger.kernel.org
+W:     http://www.mellanox.com
+Q:     http://patchwork.ozlabs.org/project/netdev/list/
 S:     Supported
 F:     drivers/net/ethernet/mellanox/mlx5/core/
 F:     include/linux/mlx5/
 
-Mellanox MLX5 IB driver
-M:     Eli Cohen <eli@mellanox.com>
+MELLANOX MLX5 IB driver
+M:     Matan Barak <matanb@mellanox.com>
+M:     Leon Romanovsky <leonro@mellanox.com>
 L:     linux-rdma@vger.kernel.org
 W:     http://www.mellanox.com
 Q:     http://patchwork.kernel.org/project/linux-rdma/list/
-T:     git git://openfabrics.org/~eli/connect-ib.git
 S:     Supported
-F:     include/linux/mlx5/
 F:     drivers/infiniband/hw/mlx5/
+F:     include/linux/mlx5/
 
 MELEXIS MLX90614 DRIVER
 M:     Crt Mori <cmo@melexis.com>
@@ -7701,6 +7718,12 @@ W:       https://github.com/jonmason/ntb/wiki
 T:     git git://github.com/jonmason/ntb.git
 F:     drivers/ntb/hw/intel/
 
+NTB AMD DRIVER
+M:     Xiangliang Yu <Xiangliang.Yu@amd.com>
+L:     linux-ntb@googlegroups.com
+S:     Supported
+F:     drivers/ntb/hw/amd/
+
 NTFS FILESYSTEM
 M:     Anton Altaparmakov <anton@tuxera.com>
 L:     linux-ntfs-dev@lists.sourceforge.net
@@ -10452,9 +10475,11 @@ S:     Maintained
 F:     drivers/net/ethernet/dlink/sundance.c
 
 SUPERH
+M:     Yoshinori Sato <ysato@users.sourceforge.jp>
+M:     Rich Felker <dalias@libc.org>
 L:     linux-sh@vger.kernel.org
 Q:     http://patchwork.kernel.org/project/linux-sh/list/
-S:     Orphan
+S:     Maintained
 F:     Documentation/sh/
 F:     arch/sh/
 F:     drivers/sh/
index 5c0e5cc8ed1026373b0f34a97c95bbfc49e4f5c5..c6b6175d020329ac74eeefb0eebf1a6c353d6ea8 100644 (file)
@@ -153,10 +153,9 @@ choice
                  mobile SoCs in the Kona family of chips (e.g. bcm28155,
                  bcm11351, etc...)
 
-       config DEBUG_BCM63XX
+       config DEBUG_BCM63XX_UART
                bool "Kernel low-level debugging on BCM63XX UART"
                depends on ARCH_BCM_63XX
-               select DEBUG_UART_BCM63XX
 
        config DEBUG_BERLIN_UART
                bool "Marvell Berlin SoC Debug UART"
@@ -1414,7 +1413,7 @@ config DEBUG_LL_INCLUDE
        default "debug/vf.S" if DEBUG_VF_UART
        default "debug/vt8500.S" if DEBUG_VT8500_UART0
        default "debug/zynq.S" if DEBUG_ZYNQ_UART0 || DEBUG_ZYNQ_UART1
-       default "debug/bcm63xx.S" if DEBUG_UART_BCM63XX
+       default "debug/bcm63xx.S" if DEBUG_BCM63XX_UART
        default "debug/digicolor.S" if DEBUG_DIGICOLOR_UA0
        default "mach/debug-macro.S"
 
@@ -1428,10 +1427,6 @@ config DEBUG_UART_8250
                ARCH_IOP13XX || ARCH_IOP32X || ARCH_IOP33X || ARCH_IXP4XX || \
                ARCH_RPC
 
-# Compatibility options for BCM63xx
-config DEBUG_UART_BCM63XX
-       def_bool ARCH_BCM_63XX
-
 config DEBUG_UART_PHYS
        hex "Physical base address of debug UART"
        default 0x00100a00 if DEBUG_NETX_UART
@@ -1529,7 +1524,7 @@ config DEBUG_UART_PHYS
        default 0xfffb0000 if DEBUG_OMAP1UART1 || DEBUG_OMAP7XXUART1
        default 0xfffb0800 if DEBUG_OMAP1UART2 || DEBUG_OMAP7XXUART2
        default 0xfffb9800 if DEBUG_OMAP1UART3 || DEBUG_OMAP7XXUART3
-       default 0xfffe8600 if DEBUG_UART_BCM63XX
+       default 0xfffe8600 if DEBUG_BCM63XX_UART
        default 0xfffff700 if ARCH_IOP33X
        depends on ARCH_EP93XX || \
                DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \
@@ -1542,7 +1537,7 @@ config DEBUG_UART_PHYS
                DEBUG_RMOBILE_SCIFA0 || DEBUG_RMOBILE_SCIFA1 || \
                DEBUG_RMOBILE_SCIFA4 || DEBUG_S3C24XX_UART || \
                DEBUG_S3C64XX_UART || \
-               DEBUG_UART_BCM63XX || DEBUG_ASM9260_UART || \
+               DEBUG_BCM63XX_UART || DEBUG_ASM9260_UART || \
                DEBUG_SIRFSOC_UART || DEBUG_DIGICOLOR_UA0 || \
                DEBUG_AT91_UART
 
@@ -1588,7 +1583,7 @@ config DEBUG_UART_VIRT
        default 0xfb10c000 if DEBUG_REALVIEW_PB1176_PORT
        default 0xfc40ab00 if DEBUG_BRCMSTB_UART
        default 0xfc705000 if DEBUG_ZTE_ZX
-       default 0xfcfe8600 if DEBUG_UART_BCM63XX
+       default 0xfcfe8600 if DEBUG_BCM63XX_UART
        default 0xfd000000 if DEBUG_SPEAR3XX || DEBUG_SPEAR13XX
        default 0xfd012000 if DEBUG_MVEBU_UART0_ALTERNATE && ARCH_MV78XX0
        default 0xfd883000 if DEBUG_ALPINE_UART0
@@ -1638,7 +1633,7 @@ config DEBUG_UART_VIRT
                DEBUG_NETX_UART || \
                DEBUG_QCOM_UARTDM || DEBUG_S3C24XX_UART || \
                DEBUG_S3C64XX_UART || \
-               DEBUG_UART_BCM63XX || DEBUG_ASM9260_UART || \
+               DEBUG_BCM63XX_UART || DEBUG_ASM9260_UART || \
                DEBUG_SIRFSOC_UART || DEBUG_DIGICOLOR_UA0
 
 config DEBUG_UART_8250_SHIFT
index 78a21f2828dff6d1cbafa9139b12791d4840936d..c548cabb102f5ae1eb063a43117adedecf52b0b0 100644 (file)
 };
 
 &extal1_clk {
-       clock-frequency = <25000000>;
+       clock-frequency = <24000000>;
 };
 &extal2_clk {
        clock-frequency = <48000000>;
index 8be6632407d8f0dbb75f87c5878578dffba70fbb..dae8d86ef4ccc75c4bb3dfbb98aba9e7e12eec9f 100644 (file)
@@ -4,10 +4,9 @@
 ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \
        -I$(srctree)/arch/arm/plat-versatile/include
 
-
+obj-y                                  := core.o
 obj-$(CONFIG_REALVIEW_DT)              += realview-dt.o
 obj-$(CONFIG_SMP)                      += platsmp-dt.o
-obj-y                                  := core.o
 
 ifdef CONFIG_ATAGS
 obj-$(CONFIG_MACH_REALVIEW_EB)         += realview_eb.o
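The reordering is the actual fix: in make, ":=" is a simple assignment that
discards anything appended to the variable earlier, so placing it after the
"+=" lines silently dropped those objects from the build. A standalone sketch
with hypothetical names:

  obj-$(CONFIG_FOO) += foo.o
  obj-y             := core.o   # wrong: clobbers foo.o appended above

  obj-y             := core.o
  obj-$(CONFIG_FOO) += foo.o    # right: appends after core.o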
index a90f3556017fe80ae3d2587e54e063cd619c0e6a..0fa8b84ed657e702a0f34f037927bceec78b8046 100644 (file)
@@ -13,57 +13,5 @@ menuconfig ARCH_TEGRA
        select ARCH_HAS_RESET_CONTROLLER
        select RESET_CONTROLLER
        select SOC_BUS
-       select USB_ULPI if USB_PHY
-       select USB_ULPI_VIEWPORT if USB_PHY
        help
          This enables support for NVIDIA Tegra based systems.
-
-if ARCH_TEGRA
-
-config ARCH_TEGRA_2x_SOC
-       bool "Enable support for Tegra20 family"
-       select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
-       select ARM_ERRATA_720789
-       select ARM_ERRATA_754327 if SMP
-       select ARM_ERRATA_764369 if SMP
-       select PINCTRL_TEGRA20
-       select PL310_ERRATA_727915 if CACHE_L2X0
-       select PL310_ERRATA_769419 if CACHE_L2X0
-       select TEGRA_TIMER
-       help
-         Support for NVIDIA Tegra AP20 and T20 processors, based on the
-         ARM CortexA9MP CPU and the ARM PL310 L2 cache controller
-
-config ARCH_TEGRA_3x_SOC
-       bool "Enable support for Tegra30 family"
-       select ARM_ERRATA_754322
-       select ARM_ERRATA_764369 if SMP
-       select PINCTRL_TEGRA30
-       select PL310_ERRATA_769419 if CACHE_L2X0
-       select TEGRA_TIMER
-       help
-         Support for NVIDIA Tegra T30 processor family, based on the
-         ARM CortexA9MP CPU and the ARM PL310 L2 cache controller
-
-config ARCH_TEGRA_114_SOC
-       bool "Enable support for Tegra114 family"
-       select ARM_ERRATA_798181 if SMP
-       select ARM_L1_CACHE_SHIFT_6
-       select HAVE_ARM_ARCH_TIMER
-       select PINCTRL_TEGRA114
-       select TEGRA_TIMER
-       help
-         Support for NVIDIA Tegra T114 processor family, based on the
-         ARM CortexA15MP CPU
-
-config ARCH_TEGRA_124_SOC
-       bool "Enable support for Tegra124 family"
-       select ARM_L1_CACHE_SHIFT_6
-       select HAVE_ARM_ARCH_TIMER
-       select PINCTRL_TEGRA124
-       select TEGRA_TIMER
-       help
-         Support for NVIDIA Tegra T124 processor family, based on the
-         ARM CortexA15MP CPU
-
-endif
index e6b684e14322cef21e44d1b660e4c6e8d0c7ec43..f5d19667484edd38e65bb1aa1d1a4413a036bfbf 100644 (file)
@@ -231,8 +231,11 @@ ENDPROC(tegra20_cpu_is_resettable_soon)
  * tegra20_tear_down_core in IRAM
  */
 ENTRY(tegra20_sleep_core_finish)
+       mov     r4, r0
        /* Flush, disable the L1 data cache and exit SMP */
+       mov     r0, #TEGRA_FLUSH_CACHE_ALL
        bl      tegra_disable_clean_inv_dcache
+       mov     r0, r4
 
        mov32   r3, tegra_shut_off_mmu
        add     r3, r3, r0
index 9a2f0b051e1035a4f956dbf98a90a9de493416e9..16e5ff03383cad5b709eb42b24340b033f86a2c7 100644 (file)
@@ -242,8 +242,11 @@ ENDPROC(tegra30_cpu_shutdown)
  * tegra30_tear_down_core in IRAM
  */
 ENTRY(tegra30_sleep_core_finish)
+       mov     r4, r0
        /* Flush, disable the L1 data cache and exit SMP */
+       mov     r0, #TEGRA_FLUSH_CACHE_ALL
        bl      tegra_disable_clean_inv_dcache
+       mov     r0, r4
 
        /*
         * Preload all the address literals that are needed for the
index 534a60ae282e702d3b06e8dad289d8f109cd80ef..0eca3812527e614e891113839992b4e20cf3fb8d 100644 (file)
@@ -1200,10 +1200,7 @@ error:
        while (i--)
                if (pages[i])
                        __free_pages(pages[i], 0);
-       if (array_size <= PAGE_SIZE)
-               kfree(pages);
-       else
-               vfree(pages);
+       kvfree(pages);
        return NULL;
 }
 
@@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
                               size_t size, struct dma_attrs *attrs)
 {
        int count = size >> PAGE_SHIFT;
-       int array_size = count * sizeof(struct page *);
        int i;
 
        if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) {
@@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
                                __free_pages(pages[i], 0);
        }
 
-       if (array_size <= PAGE_SIZE)
-               kfree(pages);
-       else
-               vfree(pages);
+       kvfree(pages);
        return 0;
 }
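kvfree(), declared in linux/mm.h, frees either a kmalloc'd or a vmalloc'd
pointer, which is what lets both branches collapse into a single call here. A
minimal sketch of the allocate/free pairing this relies on; the helper names
are hypothetical and error handling is elided:

	#include <linux/mm.h>
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static struct page **alloc_page_array(size_t count)
	{
		size_t array_size = count * sizeof(struct page *);

		/* Small arrays come from the slab, large ones from vmalloc. */
		if (array_size <= PAGE_SIZE)
			return kzalloc(array_size, GFP_KERNEL);
		return vzalloc(array_size);
	}

	static void free_page_array(struct page **pages)
	{
		/* kvfree() detects which allocator backed the pointer. */
		kvfree(pages);
	}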
 
index 2c400412b3bce47ef97b7f7eb74b0018d88e7dab..21074f674bdeb707c137a9b87c2a65bde4bbb25a 100644 (file)
@@ -105,18 +105,6 @@ config ARCH_TEGRA
        help
          This enables support for the NVIDIA Tegra SoC family.
 
-config ARCH_TEGRA_132_SOC
-       bool "NVIDIA Tegra132 SoC"
-       depends on ARCH_TEGRA
-       select PINCTRL_TEGRA124
-       select USB_ULPI if USB_PHY
-       select USB_ULPI_VIEWPORT if USB_PHY
-       help
-         Enable support for NVIDIA Tegra132 SoC, based on the Denver
-         ARMv8 CPU.  The Tegra132 SoC is similar to the Tegra124 SoC,
-         but contains an NVIDIA Denver CPU complex in place of
-         Tegra124's "4+1" Cortex-A15 CPU complex.
-
 config ARCH_SPRD
        bool "Spreadtrum SoC platform"
        help
index 76e7510835b29322957d0d5b05654ce5fc6f4597..f832b8a7453a307b351273a83e320e413e27ce4a 100644 (file)
@@ -9,6 +9,7 @@ dts-dirs += freescale
 dts-dirs += hisilicon
 dts-dirs += marvell
 dts-dirs += mediatek
+dts-dirs += nvidia
 dts-dirs += qcom
 dts-dirs += renesas
 dts-dirs += rockchip
diff --git a/arch/arm64/boot/dts/nvidia/Makefile b/arch/arm64/boot/dts/nvidia/Makefile
new file mode 100644 (file)
index 0000000..a7e865d
--- /dev/null
@@ -0,0 +1,7 @@
+dtb-$(CONFIG_ARCH_TEGRA_132_SOC) += tegra132-norrin.dtb
+dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2371-0000.dtb
+dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2371-2180.dtb
+dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2571.dtb
+
+always         := $(dtb-y)
+clean-files    := *.dtb
diff --git a/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts b/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts
new file mode 100644 (file)
index 0000000..7dfe1c0
--- /dev/null
@@ -0,0 +1,1130 @@
+/dts-v1/;
+
+#include <dt-bindings/input/input.h>
+#include "tegra132.dtsi"
+
+/ {
+       model = "NVIDIA Tegra132 Norrin";
+       compatible = "nvidia,norrin", "nvidia,tegra132", "nvidia,tegra124";
+
+       aliases {
+               rtc0 = "/i2c@0,7000d000/as3722@40";
+               rtc1 = "/rtc@0,7000e000";
+       };
+
+       memory {
+               device_type = "memory";
+               reg = <0x0 0x80000000 0x0 0x80000000>;
+       };
+
+       host1x@0,50000000 {
+               hdmi@0,54280000 {
+                       status = "disabled";
+
+                       vdd-supply = <&vdd_3v3_hdmi>;
+                       pll-supply = <&vdd_hdmi_pll>;
+                       hdmi-supply = <&vdd_5v0_hdmi>;
+
+                       nvidia,ddc-i2c-bus = <&hdmi_ddc>;
+                       nvidia,hpd-gpio =
+                               <&gpio TEGRA_GPIO(N, 7) GPIO_ACTIVE_HIGH>;
+               };
+
+               sor@0,54540000 {
+                       status = "okay";
+
+                       nvidia,dpaux = <&dpaux>;
+                       nvidia,panel = <&panel>;
+               };
+
+               dpaux: dpaux@0,545c0000 {
+                       vdd-supply = <&vdd_3v3_panel>;
+                       status = "okay";
+               };
+       };
+
+       gpu@0,57000000 {
+               status = "okay";
+
+               vdd-supply = <&vdd_gpu>;
+       };
+
+       pinmux@0,70000868 {
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinmux_default>;
+
+               pinmux_default: pinmux@0 {
+                       dap_mclk1_pw4 {
+                               nvidia,pins = "dap_mclk1_pw4";
+                               nvidia,function = "extperiph1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_din_pa4 {
+                               nvidia,pins = "dap2_din_pa4";
+                               nvidia,function = "i2s1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       dap2_dout_pa5 {
+                               nvidia,pins = "dap2_dout_pa5",
+                                             "dap2_fs_pa2",
+                                             "dap2_sclk_pa3";
+                               nvidia,function = "i2s1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap3_dout_pp2 {
+                               nvidia,pins = "dap3_dout_pp2";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       dvfs_pwm_px0 {
+                               nvidia,pins = "dvfs_pwm_px0",
+                                             "dvfs_clk_px2";
+                               nvidia,function = "cldvfs";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       ulpi_clk_py0 {
+                               nvidia,pins = "ulpi_clk_py0",
+                                             "ulpi_nxt_py2",
+                                             "ulpi_stp_py3";
+                               nvidia,function = "spi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       ulpi_dir_py1 {
+                               nvidia,pins = "ulpi_dir_py1";
+                               nvidia,function = "spi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       cam_i2c_scl_pbb1 {
+                               nvidia,pins = "cam_i2c_scl_pbb1",
+                                             "cam_i2c_sda_pbb2";
+                               nvidia,function = "i2c3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,lock = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_ENABLE>;
+                       };
+                       gen2_i2c_scl_pt5 {
+                               nvidia,pins = "gen2_i2c_scl_pt5",
+                                             "gen2_i2c_sda_pt6";
+                               nvidia,function = "i2c2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,lock = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_ENABLE>;
+                       };
+                       pj7 {
+                               nvidia,pins = "pj7";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       spdif_in_pk6 {
+                               nvidia,pins = "spdif_in_pk6";
+                               nvidia,function = "spdif";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk7 {
+                               nvidia,pins = "pk7";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       pg4 {
+                               nvidia,pins = "pg4",
+                                             "pg5",
+                                             "pg6",
+                                             "pi3";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       pg7 {
+                               nvidia,pins = "pg7";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       ph1 {
+                               nvidia,pins = "ph1";
+                               nvidia,function = "pwm1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk0 {
+                               nvidia,pins = "pk0",
+                                             "kb_row15_ps7",
+                                             "clk_32k_out_pa0";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       sdmmc1_clk_pz0 {
+                               nvidia,pins = "sdmmc1_clk_pz0";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       sdmmc1_cmd_pz1 {
+                               nvidia,pins = "sdmmc1_cmd_pz1",
+                                             "sdmmc1_dat0_py7",
+                                             "sdmmc1_dat1_py6",
+                                             "sdmmc1_dat2_py5",
+                                             "sdmmc1_dat3_py4";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       sdmmc3_clk_pa6 {
+                               nvidia,pins = "sdmmc3_clk_pa6";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       sdmmc3_cmd_pa7 {
+                               nvidia,pins = "sdmmc3_cmd_pa7",
+                                             "sdmmc3_dat0_pb7",
+                                             "sdmmc3_dat1_pb6",
+                                             "sdmmc3_dat2_pb5",
+                                             "sdmmc3_dat3_pb4",
+                                             "kb_col4_pq4",
+                                             "sdmmc3_clk_lb_out_pee4",
+                                             "sdmmc3_clk_lb_in_pee5",
+                                             "sdmmc3_cd_n_pv2";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       sdmmc4_clk_pcc4 {
+                               nvidia,pins = "sdmmc4_clk_pcc4";
+                               nvidia,function = "sdmmc4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       sdmmc4_cmd_pt7 {
+                               nvidia,pins = "sdmmc4_cmd_pt7",
+                                             "sdmmc4_dat0_paa0",
+                                             "sdmmc4_dat1_paa1",
+                                             "sdmmc4_dat2_paa2",
+                                             "sdmmc4_dat3_paa3",
+                                             "sdmmc4_dat4_paa4",
+                                             "sdmmc4_dat5_paa5",
+                                             "sdmmc4_dat6_paa6",
+                                             "sdmmc4_dat7_paa7";
+                               nvidia,function = "sdmmc4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       mic_det_l {
+                               nvidia,pins = "kb_row7_pr7";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       kb_row10_ps2 {
+                               nvidia,pins = "kb_row10_ps2";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       kb_row9_ps1 {
+                               nvidia,pins = "kb_row9_ps1";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_i2c_scl_pz6 {
+                               nvidia,pins = "pwr_i2c_scl_pz6",
+                                             "pwr_i2c_sda_pz7";
+                               nvidia,function = "i2cpwr";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,lock = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_ENABLE>;
+                       };
+                       jtag_rtck {
+                               nvidia,pins = "jtag_rtck";
+                               nvidia,function = "rtck";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_32k_in {
+                               nvidia,pins = "clk_32k_in";
+                               nvidia,function = "clk";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       core_pwr_req {
+                               nvidia,pins = "core_pwr_req";
+                               nvidia,function = "pwron";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       cpu_pwr_req {
+                               nvidia,pins = "cpu_pwr_req";
+                               nvidia,function = "cpu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       kb_col0_ap {
+                               nvidia,pins = "kb_col0_pq0";
+                               nvidia,function = "rsvd4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       en_vdd_sd {
+                               nvidia,pins = "kb_row0_pr0";
+                               nvidia,function = "rsvd4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       lid_open {
+                               nvidia,pins = "kb_row4_pr4";
+                               nvidia,function = "rsvd3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       pwr_int_n {
+                               nvidia,pins = "pwr_int_n";
+                               nvidia,function = "pmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       reset_out_n {
+                               nvidia,pins = "reset_out_n";
+                               nvidia,function = "reset_out_n";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk3_out_pee0 {
+                               nvidia,pins = "clk3_out_pee0";
+                               nvidia,function = "extperiph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen1_i2c_scl_pc4 {
+                               nvidia,pins = "gen1_i2c_scl_pc4",
+                                             "gen1_i2c_sda_pc5";
+                               nvidia,function = "i2c1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,lock = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_ENABLE>;
+                       };
+                       hdmi_cec_pee3 {
+                               nvidia,pins = "hdmi_cec_pee3";
+                               nvidia,function = "cec";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,lock = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       hdmi_int_pn7 {
+                               nvidia,pins = "hdmi_int_pn7";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       ddc_scl_pv4 {
+                               nvidia,pins = "ddc_scl_pv4",
+                                             "ddc_sda_pv5";
+                               nvidia,function = "i2c4";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,lock = <TEGRA_PIN_DISABLE>;
+                               nvidia,rcv-sel = <TEGRA_PIN_ENABLE>;
+                       };
+                       usb_vbus_en0_pn4 {
+                               nvidia,pins = "usb_vbus_en0_pn4",
+                                             "usb_vbus_en1_pn5",
+                                             "usb_vbus_en2_pff1";
+                               nvidia,function = "usb";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,lock = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       drive_sdio1 {
+                               nvidia,pins = "drive_sdio1";
+                               nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>;
+                               nvidia,schmitt = <TEGRA_PIN_DISABLE>;
+                               nvidia,pull-down-strength = <36>;
+                               nvidia,pull-up-strength = <20>;
+                               nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_SLOW>;
+                               nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_SLOW>;
+                       };
+                       drive_sdio3 {
+                               nvidia,pins = "drive_sdio3";
+                               nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>;
+                               nvidia,schmitt = <TEGRA_PIN_DISABLE>;
+                               nvidia,pull-down-strength = <22>;
+                               nvidia,pull-up-strength = <36>;
+                               nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_FASTEST>;
+                               nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_FASTEST>;
+                       };
+                       drive_gma {
+                               nvidia,pins = "drive_gma";
+                               nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>;
+                               nvidia,schmitt = <TEGRA_PIN_DISABLE>;
+                               nvidia,pull-down-strength = <2>;
+                               nvidia,pull-up-strength = <1>;
+                               nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_FASTEST>;
+                               nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_FASTEST>;
+                               nvidia,drive-type = <1>;
+                       };
+                       ac_ok {
+                               nvidia,pins = "pj0";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       codec_irq_l {
+                               nvidia,pins = "ph4";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       lcd_bl_en {
+                               nvidia,pins = "ph2";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_irq_l {
+                               nvidia,pins = "gpio_w3_aud_pw3";
+                               nvidia,function = "spi6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       tpm_davint_l {
+                               nvidia,pins = "ph6";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       ts_irq_l {
+                               nvidia,pins = "pk2";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       ts_reset_l {
+                               nvidia,pins = "pk4";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       ts_shdn_l {
+                               nvidia,pins = "pk1";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       ph7 {
+                               nvidia,pins = "ph7";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       sensor_irq_l {
+                               nvidia,pins = "pi6";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       wifi_en {
+                               nvidia,pins = "gpio_x7_aud_px7";
+                               nvidia,function = "rsvd4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+                       chromeos_write_protect {
+                               nvidia,pins = "kb_row1_pr1";
+                               nvidia,function = "rsvd4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       hp_det_l {
+                               nvidia,pins = "pi7";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                       };
+                       soc_warm_reset_l {
+                               nvidia,pins = "pi5";
+                               nvidia,function = "gmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                       };
+               };
+       };
+
+       serial@0,70006000 {
+               status = "okay";
+       };
+
+       pwm: pwm@0,7000a000 {
+               status = "okay";
+       };
+
+       /* HDMI DDC */
+       hdmi_ddc: i2c@0,7000c700 {
+               status = "okay";
+               clock-frequency = <100000>;
+       };
+
+       i2c@0,7000d000 {
+               status = "okay";
+               clock-frequency = <400000>;
+
+               as3722: pmic@40 {
+                       compatible = "ams,as3722";
+                       reg = <0x40>;
+                       interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>;
+
+                       ams,system-power-controller;
+
+                       #interrupt-cells = <2>;
+                       interrupt-controller;
+
+                       #gpio-cells = <2>;
+                       gpio-controller;
+
+                       pinctrl-names = "default";
+                       pinctrl-0 = <&as3722_default>;
+
+                       as3722_default: pinmux@0 {
+                               gpio0 {
+                                       pins = "gpio0";
+                                       function = "gpio";
+                                       bias-pull-down;
+                               };
+
+                               gpio1 {
+                                       pins = "gpio1";
+                                       function = "gpio";
+                                       bias-pull-up;
+                               };
+
+                               gpio2_4_7 {
+                                       pins = "gpio2", "gpio4", "gpio7";
+                                       function = "gpio";
+                                       bias-pull-up;
+                               };
+
+                               gpio3 {
+                                       pins = "gpio3";
+                                       function = "gpio";
+                                       bias-high-impedance;
+                               };
+
+                               gpio5 {
+                                       pins = "gpio5";
+                                       function = "clk32k-out";
+                                       bias-pull-down;
+                               };
+
+                               gpio6 {
+                                       pins = "gpio6";
+                                       function = "clk32k-out";
+                                       bias-pull-down;
+                               };
+                       };
+
+                       regulators {
+                               vsup-sd2-supply = <&vdd_5v0_sys>;
+                               vsup-sd3-supply = <&vdd_5v0_sys>;
+                               vsup-sd4-supply = <&vdd_5v0_sys>;
+                               vsup-sd5-supply = <&vdd_5v0_sys>;
+                               vin-ldo0-supply = <&vdd_1v35_lp0>;
+                               vin-ldo1-6-supply = <&vdd_3v3_sys>;
+                               vin-ldo2-5-7-supply = <&vddio_1v8>;
+                               vin-ldo3-4-supply = <&vdd_3v3_sys>;
+                               vin-ldo9-10-supply = <&vdd_5v0_sys>;
+                               vin-ldo11-supply = <&vdd_3v3_run>;
+
+                               sd0 {
+                                       regulator-name = "+VDD_CPU_AP";
+                                       regulator-min-microvolt = <700000>;
+                                       regulator-max-microvolt = <1350000>;
+                                       regulator-max-microamp = <3500000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                                       ams,ext-control = <2>; /* ENABLE2 pin */
+                               };
+
+                               sd1 {
+                                       regulator-name = "+VDD_CORE";
+                                       regulator-min-microvolt = <700000>;
+                                       regulator-max-microvolt = <1350000>;
+                                       regulator-max-microamp = <4000000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                                       ams,ext-control = <1>; /* ENABLE1 pin */
+                               };
+
+                               vdd_1v35_lp0: sd2 {
+                                       regulator-name = "+1.35V_LP0(sd2)";
+                                       regulator-min-microvolt = <1350000>;
+                                       regulator-max-microvolt = <1350000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                               };
+
+                               sd3 {
+                                       regulator-name = "+1.35V_LP0(sd3)";
+                                       regulator-min-microvolt = <1350000>;
+                                       regulator-max-microvolt = <1350000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                               };
+
+                               vdd_1v05_run: sd4 {
+                                       regulator-name = "+1.05V_RUN";
+                                       regulator-min-microvolt = <1050000>;
+                                       regulator-max-microvolt = <1050000>;
+                               };
+
+                               vddio_1v8: sd5 {
+                                       regulator-name = "+1.8V_VDDIO";
+                                       regulator-min-microvolt = <1800000>;
+                                       regulator-max-microvolt = <1800000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                               };
+
+                               vdd_gpu: sd6 {
+                                       regulator-name = "+VDD_GPU_AP";
+                                       regulator-min-microvolt = <800000>;
+                                       regulator-max-microvolt = <1200000>;
+                                       regulator-min-microamp = <3500000>;
+                                       regulator-max-microamp = <3500000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                               };
+
+                               ldo0 {
+                                       regulator-name = "+1.05_RUN_AVDD";
+                                       regulator-min-microvolt = <1050000>;
+                                       regulator-max-microvolt = <1050000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                                       ams,ext-control = <1>; /* ENABLE1 pin */
+                               };
+
+                               ldo1 {
+                                       regulator-name = "+1.8V_RUN_CAM";
+                                       regulator-min-microvolt = <1800000>;
+                                       regulator-max-microvolt = <1800000>;
+                               };
+
+                               ldo2 {
+                                       regulator-name = "+1.2V_GEN_AVDD";
+                                       regulator-min-microvolt = <1200000>;
+                                       regulator-max-microvolt = <1200000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                               };
+
+                               ldo3 {
+                                       regulator-name = "+1.00V_LP0_VDD_RTC";
+                                       regulator-min-microvolt = <1000000>;
+                                       regulator-max-microvolt = <1000000>;
+                                       regulator-always-on;
+                                       regulator-boot-on;
+                                       ams,enable-tracking; /* track the SD1 output */
+                               };
+
+                               vdd_run_cam: ldo4 {
+                                       regulator-name = "+2.8V_RUN_CAM";
+                                       regulator-min-microvolt = <2800000>;
+                                       regulator-max-microvolt = <2800000>;
+                               };
+
+                               ldo5 {
+                                       regulator-name = "+1.2V_RUN_CAM_FRONT";
+                                       regulator-min-microvolt = <1200000>;
+                                       regulator-max-microvolt = <1200000>;
+                               };
+
+                               vddio_sdmmc3: ldo6 {
+                                       regulator-name = "+VDDIO_SDMMC3";
+                                       regulator-min-microvolt = <1800000>;
+                                       regulator-max-microvolt = <3300000>;
+                               };
+
+                               ldo7 {
+                                       regulator-name = "+1.05V_RUN_CAM_REAR";
+                                       regulator-min-microvolt = <1050000>;
+                                       regulator-max-microvolt = <1050000>;
+                               };
+
+                               ldo9 {
+                                       regulator-name = "+2.8V_RUN_TOUCH";
+                                       regulator-min-microvolt = <2800000>;
+                                       regulator-max-microvolt = <2800000>;
+                               };
+
+                               ldo10 {
+                                       regulator-name = "+2.8V_RUN_CAM_AF";
+                                       regulator-min-microvolt = <2800000>;
+                                       regulator-max-microvolt = <2800000>;
+                               };
+
+                               ldo11 {
+                                       regulator-name = "+1.8V_RUN_VPP_FUSE";
+                                       regulator-min-microvolt = <1800000>;
+                                       regulator-max-microvolt = <1800000>;
+                               };
+                       };
+               };
+       };
+
+       spi@0,7000d400 {
+               status = "okay";
+
+               ec: cros-ec@0 {
+                       compatible = "google,cros-ec-spi";
+                       spi-max-frequency = <3000000>;
+                       interrupt-parent = <&gpio>;
+                       interrupts = <TEGRA_GPIO(C, 7) IRQ_TYPE_LEVEL_LOW>;
+                       reg = <0>;
+
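+                       /* delay between SPI transactions to the EC, in microseconds */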
+                       google,cros-ec-spi-msg-delay = <2000>;
+
+                       i2c_20: i2c-tunnel {
+                               compatible = "google,cros-ec-i2c-tunnel";
+                               #address-cells = <1>;
+                               #size-cells = <0>;
+
+                               google,remote-bus = <0>;
+
+                               charger: bq24735 {
+                                       compatible = "ti,bq24735";
+                                       reg = <0x9>;
+                                       interrupt-parent = <&gpio>;
+                                       interrupts = <TEGRA_GPIO(J, 0)
+                                                       GPIO_ACTIVE_HIGH>;
+                                       ti,ac-detect-gpios = <&gpio
+                                                       TEGRA_GPIO(J, 0)
+                                                       GPIO_ACTIVE_HIGH>;
+                               };
+
+                               battery: smart-battery {
+                                       compatible = "sbs,sbs-battery";
+                                       reg = <0xb>;
+                                       battery-name = "battery";
+                                       sbs,i2c-retry-count = <2>;
+                                       sbs,poll-retry-count = <10>;
+                               /*      power-supplies = <&charger>; */
+                               };
+                       };
+
+                       keyboard-controller {
+                               compatible = "google,cros-ec-keyb";
+                               keypad,num-rows = <8>;
+                               keypad,num-columns = <13>;
+                               google,needs-ghost-filter;
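+                               /*
+                                * Each entry below is MATRIX_KEY(row, column,
+                                * keycode), indexing into the 8x13 matrix
+                                * declared above.
+                                */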
+                               linux,keymap =
+                                       <MATRIX_KEY(0x00, 0x01, KEY_LEFTMETA)
+                                        MATRIX_KEY(0x00, 0x02, KEY_F1)
+                                        MATRIX_KEY(0x00, 0x03, KEY_B)
+                                        MATRIX_KEY(0x00, 0x04, KEY_F10)
+                                        MATRIX_KEY(0x00, 0x06, KEY_N)
+                                        MATRIX_KEY(0x00, 0x08, KEY_EQUAL)
+                                        MATRIX_KEY(0x00, 0x0a, KEY_RIGHTALT)
+
+                                        MATRIX_KEY(0x01, 0x01, KEY_ESC)
+                                        MATRIX_KEY(0x01, 0x02, KEY_F4)
+                                        MATRIX_KEY(0x01, 0x03, KEY_G)
+                                        MATRIX_KEY(0x01, 0x04, KEY_F7)
+                                        MATRIX_KEY(0x01, 0x06, KEY_H)
+                                        MATRIX_KEY(0x01, 0x08, KEY_APOSTROPHE)
+                                        MATRIX_KEY(0x01, 0x09, KEY_F9)
+                                        MATRIX_KEY(0x01, 0x0b, KEY_BACKSPACE)
+
+                                        MATRIX_KEY(0x02, 0x00, KEY_LEFTCTRL)
+                                        MATRIX_KEY(0x02, 0x01, KEY_TAB)
+                                        MATRIX_KEY(0x02, 0x02, KEY_F3)
+                                        MATRIX_KEY(0x02, 0x03, KEY_T)
+                                        MATRIX_KEY(0x02, 0x04, KEY_F6)
+                                        MATRIX_KEY(0x02, 0x05, KEY_RIGHTBRACE)
+                                        MATRIX_KEY(0x02, 0x06, KEY_Y)
+                                        MATRIX_KEY(0x02, 0x07, KEY_102ND)
+                                        MATRIX_KEY(0x02, 0x08, KEY_LEFTBRACE)
+                                        MATRIX_KEY(0x02, 0x09, KEY_F8)
+
+                                        MATRIX_KEY(0x03, 0x01, KEY_GRAVE)
+                                        MATRIX_KEY(0x03, 0x02, KEY_F2)
+                                        MATRIX_KEY(0x03, 0x03, KEY_5)
+                                        MATRIX_KEY(0x03, 0x04, KEY_F5)
+                                        MATRIX_KEY(0x03, 0x06, KEY_6)
+                                        MATRIX_KEY(0x03, 0x08, KEY_MINUS)
+                                        MATRIX_KEY(0x03, 0x0b, KEY_BACKSLASH)
+
+                                        MATRIX_KEY(0x04, 0x00, KEY_RIGHTCTRL)
+                                        MATRIX_KEY(0x04, 0x01, KEY_A)
+                                        MATRIX_KEY(0x04, 0x02, KEY_D)
+                                        MATRIX_KEY(0x04, 0x03, KEY_F)
+                                        MATRIX_KEY(0x04, 0x04, KEY_S)
+                                        MATRIX_KEY(0x04, 0x05, KEY_K)
+                                        MATRIX_KEY(0x04, 0x06, KEY_J)
+                                        MATRIX_KEY(0x04, 0x08, KEY_SEMICOLON)
+                                        MATRIX_KEY(0x04, 0x09, KEY_L)
+                                        MATRIX_KEY(0x04, 0x0a, KEY_BACKSLASH)
+                                        MATRIX_KEY(0x04, 0x0b, KEY_ENTER)
+
+                                        MATRIX_KEY(0x05, 0x01, KEY_Z)
+                                        MATRIX_KEY(0x05, 0x02, KEY_C)
+                                        MATRIX_KEY(0x05, 0x03, KEY_V)
+                                        MATRIX_KEY(0x05, 0x04, KEY_X)
+                                        MATRIX_KEY(0x05, 0x05, KEY_COMMA)
+                                        MATRIX_KEY(0x05, 0x06, KEY_M)
+                                        MATRIX_KEY(0x05, 0x07, KEY_LEFTSHIFT)
+                                        MATRIX_KEY(0x05, 0x08, KEY_SLASH)
+                                        MATRIX_KEY(0x05, 0x09, KEY_DOT)
+                                        MATRIX_KEY(0x05, 0x0b, KEY_SPACE)
+
+                                        MATRIX_KEY(0x06, 0x01, KEY_1)
+                                        MATRIX_KEY(0x06, 0x02, KEY_3)
+                                        MATRIX_KEY(0x06, 0x03, KEY_4)
+                                        MATRIX_KEY(0x06, 0x04, KEY_2)
+                                        MATRIX_KEY(0x06, 0x05, KEY_8)
+                                        MATRIX_KEY(0x06, 0x06, KEY_7)
+                                        MATRIX_KEY(0x06, 0x08, KEY_0)
+                                        MATRIX_KEY(0x06, 0x09, KEY_9)
+                                        MATRIX_KEY(0x06, 0x0a, KEY_LEFTALT)
+                                        MATRIX_KEY(0x06, 0x0b, KEY_DOWN)
+                                        MATRIX_KEY(0x06, 0x0c, KEY_RIGHT)
+
+                                        MATRIX_KEY(0x07, 0x01, KEY_Q)
+                                        MATRIX_KEY(0x07, 0x02, KEY_E)
+                                        MATRIX_KEY(0x07, 0x03, KEY_R)
+                                        MATRIX_KEY(0x07, 0x04, KEY_W)
+                                        MATRIX_KEY(0x07, 0x05, KEY_I)
+                                        MATRIX_KEY(0x07, 0x06, KEY_U)
+                                        MATRIX_KEY(0x07, 0x07, KEY_RIGHTSHIFT)
+                                        MATRIX_KEY(0x07, 0x08, KEY_P)
+                                        MATRIX_KEY(0x07, 0x09, KEY_O)
+                                        MATRIX_KEY(0x07, 0x0b, KEY_UP)
+                                        MATRIX_KEY(0x07, 0x0c, KEY_LEFT)>;
+                       };
+               };
+       };
+
+       pmc@0,7000e400 {
+               nvidia,invert-interrupt;
+               nvidia,suspend-mode = <0>; /* LP0 */
+               #wake-cells = <3>;
+               nvidia,cpu-pwr-good-time = <500>; /* us */
+               nvidia,cpu-pwr-off-time = <300>; /* us */
+               nvidia,core-pwr-good-time = <641 3845>; /* <osc-stable us, power-stable us> */
+               nvidia,core-pwr-off-time = <61036>; /* us */
+               nvidia,core-power-req-active-high;
+               nvidia,sys-clock-req-active-high;
+               nvidia,reset-gpio = <&gpio TEGRA_GPIO(I, 5) GPIO_ACTIVE_LOW>;
+       };
+
+       /* WIFI/BT module */
+       sdhci@0,700b0000 {
+               status = "disabled";
+       };
+
+       /* external SD/MMC */
+       sdhci@0,700b0400 {
+               cd-gpios = <&gpio TEGRA_GPIO(V, 2) GPIO_ACTIVE_LOW>;
+               power-gpios = <&gpio TEGRA_GPIO(R, 0) GPIO_ACTIVE_HIGH>;
+               wp-gpios = <&gpio TEGRA_GPIO(Q, 4) GPIO_ACTIVE_HIGH>;
+               status = "okay";
+               bus-width = <4>;
+               vqmmc-supply = <&vddio_sdmmc3>;
+       };
+
+       /* EMMC 4.51 */
+       sdhci@0,700b0600 {
+               status = "okay";
+               bus-width = <8>;
+               non-removable;
+       };
+
+       usb@0,7d000000 {
+               status = "okay";
+       };
+
+       usb-phy@0,7d000000 {
+               status = "okay";
+               vbus-supply = <&vdd_usb1_vbus>;
+       };
+
+       usb@0,7d004000 {
+               status = "okay";
+       };
+
+       usb-phy@0,7d004000 {
+               status = "okay";
+               vbus-supply = <&vdd_run_cam>;
+       };
+
+       usb@0,7d008000 {
+               status = "okay";
+       };
+
+       usb-phy@0,7d008000 {
+               status = "okay";
+               vbus-supply = <&vdd_usb3_vbus>;
+       };
+
+       backlight: backlight {
+               compatible = "pwm-backlight";
+
+               enable-gpios = <&gpio TEGRA_GPIO(H, 2) GPIO_ACTIVE_HIGH>;
+               power-supply = <&vdd_led>;
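+               /*
+                * PWM specifier: channel 1 of the Tegra PWM controller with
+                * the period given in nanoseconds (1000000 ns = 1 kHz).
+                */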
+               pwms = <&pwm 1 1000000>;
+
+               brightness-levels = <0 4 8 16 32 64 128 255>;
+               default-brightness-level = <6>;
+
+               backlight-boot-off;
+       };
+
+       clocks {
+               compatible = "simple-bus";
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               clk32k_in: clock@0 {
+                       compatible = "fixed-clock";
+                       reg = <0>;
+                       #clock-cells = <0>;
+                       clock-frequency = <32768>;
+               };
+       };
+
+       gpio-keys {
+               compatible = "gpio-keys";
+
+               lid {
+                       label = "Lid";
+                       gpios = <&gpio TEGRA_GPIO(R, 4) GPIO_ACTIVE_LOW>;
+                       linux,input-type = <5>; /* EV_SW */
+                       linux,code = <0>; /* SW_LID */
+                       debounce-interval = <1>;
+                       gpio-key,wakeup;
+               };
+
+               power {
+                       label = "Power";
+                       gpios = <&gpio TEGRA_GPIO(Q, 0) GPIO_ACTIVE_LOW>;
+                       linux,code = <KEY_POWER>;
+                       debounce-interval = <10>;
+                       gpio-key,wakeup;
+               };
+       };
+
+       panel: panel {
+               compatible = "innolux,n116bge", "simple-panel";
+               backlight = <&backlight>;
+               ddc-i2c-bus = <&dpaux>;
+       };
+
+       regulators {
+               compatible = "simple-bus";
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               vdd_mux: regulator@0 {
+                       compatible = "regulator-fixed";
+                       reg = <0>;
+                       regulator-name = "+VDD_MUX";
+                       regulator-min-microvolt = <19000000>;
+                       regulator-max-microvolt = <19000000>;
+                       regulator-always-on;
+                       regulator-boot-on;
+               };
+
+               vdd_5v0_sys: regulator@1 {
+                       compatible = "regulator-fixed";
+                       reg = <1>;
+                       regulator-name = "+5V_SYS";
+                       regulator-min-microvolt = <5000000>;
+                       regulator-max-microvolt = <5000000>;
+                       regulator-always-on;
+                       regulator-boot-on;
+                       vin-supply = <&vdd_mux>;
+               };
+
+               vdd_3v3_sys: regulator@2 {
+                       compatible = "regulator-fixed";
+                       reg = <2>;
+                       regulator-name = "+3.3V_SYS";
+                       regulator-min-microvolt = <3300000>;
+                       regulator-max-microvolt = <3300000>;
+                       regulator-always-on;
+                       regulator-boot-on;
+                       vin-supply = <&vdd_mux>;
+               };
+
+               vdd_3v3_run: regulator@3 {
+                       compatible = "regulator-fixed";
+                       reg = <3>;
+                       regulator-name = "+3.3V_RUN";
+                       regulator-min-microvolt = <3300000>;
+                       regulator-max-microvolt = <3300000>;
+                       regulator-always-on;
+                       regulator-boot-on;
+                       gpio = <&as3722 1 GPIO_ACTIVE_HIGH>;
+                       enable-active-high;
+                       vin-supply = <&vdd_3v3_sys>;
+               };
+
+               vdd_3v3_hdmi: regulator@4 {
+                       compatible = "regulator-fixed";
+                       reg = <4>;
+                       regulator-name = "+3.3V_AVDD_HDMI_AP_GATED";
+                       regulator-min-microvolt = <3300000>;
+                       regulator-max-microvolt = <3300000>;
+                       vin-supply = <&vdd_3v3_run>;
+               };
+
+               vdd_led: regulator@5 {
+                       compatible = "regulator-fixed";
+                       reg = <5>;
+                       regulator-name = "+VDD_LED";
+                       regulator-min-microvolt = <3300000>;
+                       regulator-max-microvolt = <3300000>;
+                       gpio = <&gpio TEGRA_GPIO(P, 2) GPIO_ACTIVE_HIGH>;
+                       enable-active-high;
+                       vin-supply = <&vdd_mux>;
+               };
+
+               vdd_usb1_vbus: regulator@6 {
+                       compatible = "regulator-fixed";
+                       reg = <6>;
+                       regulator-name = "+5V_USB_HS";
+                       regulator-min-microvolt = <5000000>;
+                       regulator-max-microvolt = <5000000>;
+                       gpio = <&gpio TEGRA_GPIO(N, 4) GPIO_ACTIVE_HIGH>;
+                       enable-active-high;
+                       gpio-open-drain;
+                       vin-supply = <&vdd_5v0_sys>;
+               };
+
+               vdd_usb3_vbus: regulator@7 {
+                       compatible = "regulator-fixed";
+                       reg = <7>;
+                       regulator-name = "+5V_USB_SS";
+                       regulator-min-microvolt = <5000000>;
+                       regulator-max-microvolt = <5000000>;
+                       gpio = <&gpio TEGRA_GPIO(N, 5) GPIO_ACTIVE_HIGH>;
+                       enable-active-high;
+                       gpio-open-drain;
+                       vin-supply = <&vdd_5v0_sys>;
+               };
+
+               vdd_3v3_panel: regulator@8 {
+                       compatible = "regulator-fixed";
+                       reg = <8>;
+                       regulator-name = "+3.3V_PANEL";
+                       regulator-min-microvolt = <3300000>;
+                       regulator-max-microvolt = <3300000>;
+                       gpio = <&as3722 4 GPIO_ACTIVE_HIGH>;
+                       enable-active-high;
+                       vin-supply = <&vdd_3v3_sys>;
+               };
+
+               vdd_hdmi_pll: regulator@9 {
+                       compatible = "regulator-fixed";
+                       reg = <9>;
+                       regulator-name = "+1.05V_RUN_AVDD_HDMI_PLL_AP_GATE";
+                       regulator-min-microvolt = <1050000>;
+                       regulator-max-microvolt = <1050000>;
+                       gpio = <&gpio TEGRA_GPIO(H, 7) GPIO_ACTIVE_LOW>;
+                       vin-supply = <&vdd_1v05_run>;
+               };
+
+               vdd_5v0_hdmi: regulator@10 {
+                       compatible = "regulator-fixed";
+                       reg = <10>;
+                       regulator-name = "+5V_HDMI_CON";
+                       regulator-min-microvolt = <5000000>;
+                       regulator-max-microvolt = <5000000>;
+                       gpio = <&gpio TEGRA_GPIO(K, 6) GPIO_ACTIVE_HIGH>;
+                       enable-active-high;
+                       vin-supply = <&vdd_5v0_sys>;
+               };
+
+               vdd_5v0_ts: regulator@11 {
+                       compatible = "regulator-fixed";
+                       reg = <11>;
+                       regulator-name = "+5V_VDD_TS";
+                       regulator-min-microvolt = <5000000>;
+                       regulator-max-microvolt = <5000000>;
+                       regulator-always-on;
+                       regulator-boot-on;
+                       gpio = <&gpio TEGRA_GPIO(K, 1) GPIO_ACTIVE_HIGH>;
+                       enable-active-high;
+               };
+       };
+};
diff --git a/arch/arm64/boot/dts/nvidia/tegra132.dtsi b/arch/arm64/boot/dts/nvidia/tegra132.dtsi
new file mode 100644 (file)
index 0000000..e8bb460
--- /dev/null
@@ -0,0 +1,990 @@
+#include <dt-bindings/clock/tegra124-car.h>
+#include <dt-bindings/gpio/tegra-gpio.h>
+#include <dt-bindings/memory/tegra124-mc.h>
+#include <dt-bindings/pinctrl/pinctrl-tegra.h>
+#include <dt-bindings/pinctrl/pinctrl-tegra-xusb.h>
+#include <dt-bindings/interrupt-controller/arm-gic.h>
+
+/ {
+       compatible = "nvidia,tegra132", "nvidia,tegra124";
+       interrupt-parent = <&lic>;
+       #address-cells = <2>;
+       #size-cells = <2>;
+
+       pcie-controller@0,01003000 {
+               compatible = "nvidia,tegra124-pcie";
+               device_type = "pci";
+               reg = <0x0 0x01003000 0x0 0x00000800   /* PADS registers */
+                      0x0 0x01003800 0x0 0x00000800   /* AFI registers */
+                      0x0 0x02000000 0x0 0x10000000>; /* configuration space */
+               reg-names = "pads", "afi", "cs";
+               interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>, /* controller interrupt */
+                            <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>; /* MSI interrupt */
+               interrupt-names = "intr", "msi";
+
+               #interrupt-cells = <1>;
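+               /*
+                * A zero interrupt-map-mask makes every entry match, so all
+                * legacy INTx interrupts from any device are routed to the
+                * controller's "intr" interrupt.
+                */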
+               interrupt-map-mask = <0 0 0 0>;
+               interrupt-map = <0 0 0 0 &gic GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
+
+               bus-range = <0x00 0xff>;
+               #address-cells = <3>;
+               #size-cells = <2>;
+
+               ranges = <0x82000000 0 0x01000000 0x0 0x01000000 0 0x00001000   /* port 0 configuration space */
+                         0x82000000 0 0x01001000 0x0 0x01001000 0 0x00001000   /* port 1 configuration space */
+                         0x81000000 0 0x0        0x0 0x12000000 0 0x00010000   /* downstream I/O (64 KiB) */
+                         0x82000000 0 0x13000000 0x0 0x13000000 0 0x0d000000   /* non-prefetchable memory (208 MiB) */
+                         0xc2000000 0 0x20000000 0x0 0x20000000 0 0x20000000>; /* prefetchable memory (512 MiB) */
+
+               clocks = <&tegra_car TEGRA124_CLK_PCIE>,
+                        <&tegra_car TEGRA124_CLK_AFI>,
+                        <&tegra_car TEGRA124_CLK_PLL_E>,
+                        <&tegra_car TEGRA124_CLK_CML0>;
+               clock-names = "pex", "afi", "pll_e", "cml";
+               resets = <&tegra_car 70>,
+                        <&tegra_car 72>,
+                        <&tegra_car 74>;
+               reset-names = "pex", "afi", "pcie_x";
+               status = "disabled";
+
+               phys = <&padctl TEGRA_XUSB_PADCTL_PCIE>;
+               phy-names = "pcie";
+
+               pci@1,0 {
+                       device_type = "pci";
+                       assigned-addresses = <0x82000800 0 0x01000000 0 0x1000>;
+                       reg = <0x000800 0 0 0 0>;
+                       status = "disabled";
+
+                       #address-cells = <3>;
+                       #size-cells = <2>;
+                       ranges;
+
+                       nvidia,num-lanes = <2>;
+               };
+
+               pci@2,0 {
+                       device_type = "pci";
+                       assigned-addresses = <0x82001000 0 0x01001000 0 0x1000>;
+                       reg = <0x001000 0 0 0 0>;
+                       status = "disabled";
+
+                       #address-cells = <3>;
+                       #size-cells = <2>;
+                       ranges;
+
+                       nvidia,num-lanes = <1>;
+               };
+       };
+
+       host1x@0,50000000 {
+               compatible = "nvidia,tegra124-host1x", "simple-bus";
+               reg = <0x0 0x50000000 0x0 0x00034000>;
+               interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, /* syncpt */
+                            <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; /* general */
+               clocks = <&tegra_car TEGRA124_CLK_HOST1X>;
+               clock-names = "host1x";
+               resets = <&tegra_car 28>;
+               reset-names = "host1x";
+
+               #address-cells = <2>;
+               #size-cells = <2>;
+
+               ranges = <0 0x54000000 0 0x54000000 0 0x01000000>;
+
+               dc@0,54200000 {
+                       compatible = "nvidia,tegra124-dc";
+                       reg = <0x0 0x54200000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA124_CLK_DISP1>,
+                                <&tegra_car TEGRA124_CLK_PLL_P>;
+                       clock-names = "dc", "parent";
+                       resets = <&tegra_car 27>;
+                       reset-names = "dc";
+
+                       iommus = <&mc TEGRA_SWGROUP_DC>;
+
+                       nvidia,head = <0>;
+               };
+
+               dc@0,54240000 {
+                       compatible = "nvidia,tegra124-dc";
+                       reg = <0x0 0x54240000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA124_CLK_DISP2>,
+                                <&tegra_car TEGRA124_CLK_PLL_P>;
+                       clock-names = "dc", "parent";
+                       resets = <&tegra_car 26>;
+                       reset-names = "dc";
+
+                       iommus = <&mc TEGRA_SWGROUP_DCB>;
+
+                       nvidia,head = <1>;
+               };
+
+               hdmi@0,54280000 {
+                       compatible = "nvidia,tegra124-hdmi";
+                       reg = <0x0 0x54280000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA124_CLK_HDMI>,
+                                <&tegra_car TEGRA124_CLK_PLL_D2_OUT0>;
+                       clock-names = "hdmi", "parent";
+                       resets = <&tegra_car 51>;
+                       reset-names = "hdmi";
+                       status = "disabled";
+               };
+
+               sor@0,54540000 {
+                       compatible = "nvidia,tegra124-sor";
+                       reg = <0x0 0x54540000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA124_CLK_SOR0>,
+                                <&tegra_car TEGRA124_CLK_PLL_D_OUT0>,
+                                <&tegra_car TEGRA124_CLK_PLL_DP>,
+                                <&tegra_car TEGRA124_CLK_CLK_M>;
+                       clock-names = "sor", "parent", "dp", "safe";
+                       resets = <&tegra_car 182>;
+                       reset-names = "sor";
+                       status = "disabled";
+               };
+
+               dpaux: dpaux@0,545c0000 {
+                       compatible = "nvidia,tegra124-dpaux";
+                       reg = <0x0 0x545c0000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA124_CLK_DPAUX>,
+                                <&tegra_car TEGRA124_CLK_PLL_DP>;
+                       clock-names = "dpaux", "parent";
+                       resets = <&tegra_car 181>;
+                       reset-names = "dpaux";
+                       status = "disabled";
+               };
+       };
+
+       gic: interrupt-controller@0,50041000 {
+               compatible = "arm,cortex-a15-gic";
+               #interrupt-cells = <3>;
+               interrupt-controller;
+               reg = <0x0 0x50041000 0x0 0x1000>,
+                     <0x0 0x50042000 0x0 0x2000>,
+                     <0x0 0x50044000 0x0 0x2000>,
+                     <0x0 0x50046000 0x0 0x2000>;
+               interrupts = <GIC_PPI 9
+                       (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>;
+               interrupt-parent = <&gic>;
+       };
+
+       gpu@0,57000000 {
+               compatible = "nvidia,gk20a";
+               reg = <0x0 0x57000000 0x0 0x01000000>,
+                     <0x0 0x58000000 0x0 0x01000000>;
+               interrupts = <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>;
+               interrupt-names = "stall", "nonstall";
+               clocks = <&tegra_car TEGRA124_CLK_GPU>,
+                        <&tegra_car TEGRA124_CLK_PLL_P_OUT5>;
+               clock-names = "gpu", "pwr";
+               resets = <&tegra_car 184>;
+               reset-names = "gpu";
+               status = "disabled";
+       };
+
+       lic: interrupt-controller@0,60004000 {
+               compatible = "nvidia,tegra124-ictlr", "nvidia,tegra30-ictlr";
+               reg = <0x0 0x60004000 0x0 0x100>,
+                     <0x0 0x60004100 0x0 0x100>,
+                     <0x0 0x60004200 0x0 0x100>,
+                     <0x0 0x60004300 0x0 0x100>,
+                     <0x0 0x60004400 0x0 0x100>;
+               interrupt-controller;
+               #interrupt-cells = <3>;
+               interrupt-parent = <&gic>;
+       };
+
+       timer@0,60005000 {
+               compatible = "nvidia,tegra124-timer", "nvidia,tegra20-timer";
+               reg = <0x0 0x60005000 0x0 0x400>;
+               interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_TIMER>;
+               clock-names = "timer";
+       };
+
+       tegra_car: clock@0,60006000 {
+               compatible = "nvidia,tegra132-car";
+               reg = <0x0 0x60006000 0x0 0x1000>;
+               #clock-cells = <1>;
+               #reset-cells = <1>;
+               nvidia,external-memory-controller = <&emc>;
+       };
+
+       flow-controller@0,60007000 {
+               compatible = "nvidia,tegra124-flowctrl";
+               reg = <0x0 0x60007000 0x0 0x1000>;
+       };
+
+       actmon@0,6000c800 {
+               compatible = "nvidia,tegra124-actmon";
+               reg = <0x0 0x6000c800 0x0 0x400>;
+               interrupts = <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_ACTMON>,
+                        <&tegra_car TEGRA124_CLK_EMC>;
+               clock-names = "actmon", "emc";
+               resets = <&tegra_car 119>;
+               reset-names = "actmon";
+       };
+
+       gpio: gpio@0,6000d000 {
+               compatible = "nvidia,tegra124-gpio", "nvidia,tegra30-gpio";
+               reg = <0x0 0x6000d000 0x0 0x1000>;
+               interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>;
+               #gpio-cells = <2>;
+               gpio-controller;
+               #interrupt-cells = <2>;
+               interrupt-controller;
+       };
+
+       apbdma: dma@0,60020000 {
+               compatible = "nvidia,tegra124-apbdma", "nvidia,tegra148-apbdma";
+               reg = <0x0 0x60020000 0x0 0x1400>;
+               interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 130 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 132 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 133 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 135 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_APBDMA>;
+               clock-names = "dma";
+               resets = <&tegra_car 34>;
+               reset-names = "dma";
+               #dma-cells = <1>;
+       };
+
+       apbmisc@0,70000800 {
+               compatible = "nvidia,tegra124-apbmisc", "nvidia,tegra20-apbmisc";
+               reg = <0x0 0x70000800 0x0 0x64>,   /* Chip revision */
+                     <0x0 0x7000e864 0x0 0x04>;   /* Strapping options */
+       };
+
+       pinmux: pinmux@0,70000868 {
+               compatible = "nvidia,tegra124-pinmux";
+               reg = <0x0 0x70000868 0x0 0x164>, /* Pad control registers */
+                     <0x0 0x70003000 0x0 0x434>, /* Mux registers */
+                     <0x0 0x70000820 0x0 0x008>; /* MIPI pad control */
+       };
+
+       /*
+        * There are two serial drivers: the simple 8250-based driver, and
+        * the APB-DMA-based serial driver for higher baud rates and better
+        * performance. To enable the 8250-based driver, use the compatible
+        * strings "nvidia,tegra124-uart", "nvidia,tegra20-uart"; to enable
+        * the APB-DMA-based serial driver, use the compatible strings
+        * "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart".
+        */
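+       /*
+        * For instance, a board DTS that wants the DMA-based driver could
+        * override the compatible string where it enables the port. This is
+        * an illustrative sketch only, assuming the uarta label below:
+        *
+        *      &uarta {
+        *              compatible = "nvidia,tegra124-hsuart",
+        *                           "nvidia,tegra30-hsuart";
+        *              status = "okay";
+        *      };
+        */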
+       uarta: serial@0,70006000 {
+               compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006000 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_UARTA>;
+               clock-names = "serial";
+               resets = <&tegra_car 6>;
+               reset-names = "serial";
+               dmas = <&apbdma 8>, <&apbdma 8>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       uartb: serial@0,70006040 {
+               compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006040 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_UARTB>;
+               clock-names = "serial";
+               resets = <&tegra_car 7>;
+               reset-names = "serial";
+               dmas = <&apbdma 9>, <&apbdma 9>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       uartc: serial@0,70006200 {
+               compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006200 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_UARTC>;
+               clock-names = "serial";
+               resets = <&tegra_car 55>;
+               reset-names = "serial";
+               dmas = <&apbdma 10>, <&apbdma 10>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       uartd: serial@0,70006300 {
+               compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006300 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_UARTD>;
+               clock-names = "serial";
+               resets = <&tegra_car 65>;
+               reset-names = "serial";
+               dmas = <&apbdma 19>, <&apbdma 19>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       pwm: pwm@0,7000a000 {
+               compatible = "nvidia,tegra124-pwm", "nvidia,tegra20-pwm";
+               reg = <0x0 0x7000a000 0x0 0x100>;
+               #pwm-cells = <2>;
+               clocks = <&tegra_car TEGRA124_CLK_PWM>;
+               clock-names = "pwm";
+               resets = <&tegra_car 17>;
+               reset-names = "pwm";
+               status = "disabled";
+       };
+
+       i2c@0,7000c000 {
+               compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c000 0x0 0x100>;
+               interrupts = <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_I2C1>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 12>;
+               reset-names = "i2c";
+               dmas = <&apbdma 21>, <&apbdma 21>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000c400 {
+               compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c400 0x0 0x100>;
+               interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_I2C2>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 54>;
+               reset-names = "i2c";
+               dmas = <&apbdma 22>, <&apbdma 22>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000c500 {
+               compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c500 0x0 0x100>;
+               interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_I2C3>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 67>;
+               reset-names = "i2c";
+               dmas = <&apbdma 23>, <&apbdma 23>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000c700 {
+               compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c700 0x0 0x100>;
+               interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_I2C4>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 103>;
+               reset-names = "i2c";
+               dmas = <&apbdma 26>, <&apbdma 26>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000d000 {
+               compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000d000 0x0 0x100>;
+               interrupts = <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_I2C5>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 47>;
+               reset-names = "i2c";
+               dmas = <&apbdma 24>, <&apbdma 24>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000d100 {
+               compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000d100 0x0 0x100>;
+               interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_I2C6>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 166>;
+               reset-names = "i2c";
+               dmas = <&apbdma 30>, <&apbdma 30>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
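+       /* SPI controllers SBC1..SBC6 */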
+       spi@0,7000d400 {
+               compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000d400 0x0 0x200>;
+               interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_SBC1>;
+               clock-names = "spi";
+               resets = <&tegra_car 41>;
+               reset-names = "spi";
+               dmas = <&apbdma 15>, <&apbdma 15>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000d600 {
+               compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000d600 0x0 0x200>;
+               interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_SBC2>;
+               clock-names = "spi";
+               resets = <&tegra_car 44>;
+               reset-names = "spi";
+               dmas = <&apbdma 16>, <&apbdma 16>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000d800 {
+               compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000d800 0x0 0x200>;
+               interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_SBC3>;
+               clock-names = "spi";
+               resets = <&tegra_car 46>;
+               reset-names = "spi";
+               dmas = <&apbdma 17>, <&apbdma 17>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000da00 {
+               compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000da00 0x0 0x200>;
+               interrupts = <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_SBC4>;
+               clock-names = "spi";
+               resets = <&tegra_car 68>;
+               reset-names = "spi";
+               dmas = <&apbdma 18>, <&apbdma 18>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000dc00 {
+               compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000dc00 0x0 0x200>;
+               interrupts = <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_SBC5>;
+               clock-names = "spi";
+               resets = <&tegra_car 104>;
+               reset-names = "spi";
+               dmas = <&apbdma 27>, <&apbdma 27>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000de00 {
+               compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000de00 0x0 0x200>;
+               interrupts = <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA124_CLK_SBC6>;
+               clock-names = "spi";
+               resets = <&tegra_car 105>;
+               reset-names = "spi";
+               dmas = <&apbdma 28>, <&apbdma 28>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       rtc@0,7000e000 {
+               compatible = "nvidia,tegra124-rtc", "nvidia,tegra20-rtc";
+               reg = <0x0 0x7000e000 0x0 0x100>;
+               interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_RTC>;
+               clock-names = "rtc";
+       };
+
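+       /* PMC; clk32k_in is the board-provided 32.768 kHz clock */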
+       pmc@0,7000e400 {
+               compatible = "nvidia,tegra124-pmc";
+               reg = <0x0 0x7000e400 0x0 0x400>;
+               clocks = <&tegra_car TEGRA124_CLK_PCLK>, <&clk32k_in>;
+               clock-names = "pclk", "clk32k_in";
+       };
+
+       fuse@0,7000f800 {
+               compatible = "nvidia,tegra124-efuse";
+               reg = <0x0 0x7000f800 0x0 0x400>;
+               clocks = <&tegra_car TEGRA124_CLK_FUSE>;
+               clock-names = "fuse";
+               resets = <&tegra_car 39>;
+               reset-names = "fuse";
+       };
+
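+       /* the memory controller also implements the SMMU, hence #iommu-cells */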
+       mc: memory-controller@0,70019000 {
+               compatible = "nvidia,tegra132-mc";
+               reg = <0x0 0x70019000 0x0 0x1000>;
+               clocks = <&tegra_car TEGRA124_CLK_MC>;
+               clock-names = "mc";
+
+               interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>;
+
+               #iommu-cells = <1>;
+       };
+
+       emc: emc@0,7001b000 {
+               compatible = "nvidia,tegra132-emc", "nvidia,tegra124-emc";
+               reg = <0x0 0x7001b000 0x0 0x1000>;
+
+               nvidia,memory-controller = <&mc>;
+       };
+
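+       /* the AHCI register block is listed first; the unit-address refers to the SATA block base */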
+       sata@0,70020000 {
+               compatible = "nvidia,tegra124-ahci";
+               reg = <0x0 0x70027000 0x0 0x2000>, /* AHCI */
+                     <0x0 0x70020000 0x0 0x7000>; /* SATA */
+               interrupts = <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_SATA>,
+                        <&tegra_car TEGRA124_CLK_SATA_OOB>,
+                        <&tegra_car TEGRA124_CLK_CML1>,
+                        <&tegra_car TEGRA124_CLK_PLL_E>;
+               clock-names = "sata", "sata-oob", "cml1", "pll_e";
+               resets = <&tegra_car 124>,
+                        <&tegra_car 123>,
+                        <&tegra_car 129>;
+               reset-names = "sata", "sata-oob", "sata-cold";
+               phys = <&padctl TEGRA_XUSB_PADCTL_SATA>;
+               phy-names = "sata-phy";
+               status = "disabled";
+       };
+
+       hda@0,70030000 {
+               compatible = "nvidia,tegra132-hda", "nvidia,tegra124-hda",
+                            "nvidia,tegra30-hda";
+               reg = <0x0 0x70030000 0x0 0x10000>;
+               interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_HDA>,
+                        <&tegra_car TEGRA124_CLK_HDA2HDMI>,
+                        <&tegra_car TEGRA124_CLK_HDA2CODEC_2X>;
+               clock-names = "hda", "hda2hdmi", "hda2codec_2x";
+               resets = <&tegra_car 125>, /* hda */
+                        <&tegra_car 128>, /* hda2hdmi */
+                        <&tegra_car 111>; /* hda2codec_2x */
+               reset-names = "hda", "hda2hdmi", "hda2codec_2x";
+               status = "disabled";
+       };
+
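+       /* XUSB pad controller; boards enable the individual pads/lanes they use */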
+       padctl: padctl@0,7009f000 {
+               compatible = "nvidia,tegra132-xusb-padctl",
+                            "nvidia,tegra124-xusb-padctl";
+               reg = <0x0 0x7009f000 0x0 0x1000>;
+               resets = <&tegra_car 142>;
+               reset-names = "padctl";
+
+               #phy-cells = <1>;
+
+               phys {
+                       pcie-0 {
+                               status = "disabled";
+                       };
+
+                       sata-0 {
+                               status = "disabled";
+                       };
+
+                       usb3-0 {
+                               status = "disabled";
+                       };
+
+                       usb3-1 {
+                               status = "disabled";
+                       };
+
+                       utmi-0 {
+                               status = "disabled";
+                       };
+
+                       utmi-1 {
+                               status = "disabled";
+                       };
+
+                       utmi-2 {
+                               status = "disabled";
+                       };
+               };
+       };
+
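+       /* SDMMC1..SDMMC4 controllers */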
+       sdhci@0,700b0000 {
+               compatible = "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0000 0x0 0x200>;
+               interrupts = <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_SDMMC1>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 14>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       sdhci@0,700b0200 {
+               compatible = "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0200 0x0 0x200>;
+               interrupts = <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_SDMMC2>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 9>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       sdhci@0,700b0400 {
+               compatible = "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0400 0x0 0x200>;
+               interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_SDMMC3>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 69>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       sdhci@0,700b0600 {
+               compatible = "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0600 0x0 0x200>;
+               interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_SDMMC4>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 15>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       soctherm: thermal-sensor@0,700e2000 {
+               compatible = "nvidia,tegra124-soctherm";
+               reg = <0x0 0x700e2000 0x0 0x1000>;
+               interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
+                        <&tegra_car TEGRA124_CLK_SOC_THERM>;
+               clock-names = "tsensor", "soctherm";
+               resets = <&tegra_car 78>;
+               reset-names = "soctherm";
+               #thermal-sensor-cells = <1>;
+       };
+
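+       /* audio hub: one reset per client module, one APBIF DMA channel pair per FIFO (rx0/tx0..rx9/tx9) */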
+       ahub@0,70300000 {
+               compatible = "nvidia,tegra124-ahub";
+               reg = <0x0 0x70300000 0x0 0x200>,
+                     <0x0 0x70300800 0x0 0x800>,
+                     <0x0 0x70300200 0x0 0x600>;
+               interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA124_CLK_D_AUDIO>,
+                        <&tegra_car TEGRA124_CLK_APBIF>;
+               clock-names = "d_audio", "apbif";
+               resets = <&tegra_car 106>, /* d_audio */
+                        <&tegra_car 107>, /* apbif */
+                        <&tegra_car 30>,  /* i2s0 */
+                        <&tegra_car 11>,  /* i2s1 */
+                        <&tegra_car 18>,  /* i2s2 */
+                        <&tegra_car 101>, /* i2s3 */
+                        <&tegra_car 102>, /* i2s4 */
+                        <&tegra_car 108>, /* dam0 */
+                        <&tegra_car 109>, /* dam1 */
+                        <&tegra_car 110>, /* dam2 */
+                        <&tegra_car 10>,  /* spdif */
+                        <&tegra_car 153>, /* amx */
+                        <&tegra_car 185>, /* amx1 */
+                        <&tegra_car 154>, /* adx */
+                        <&tegra_car 180>, /* adx1 */
+                        <&tegra_car 186>, /* afc0 */
+                        <&tegra_car 187>, /* afc1 */
+                        <&tegra_car 188>, /* afc2 */
+                        <&tegra_car 189>, /* afc3 */
+                        <&tegra_car 190>, /* afc4 */
+                        <&tegra_car 191>; /* afc5 */
+               reset-names = "d_audio", "apbif", "i2s0", "i2s1", "i2s2",
+                             "i2s3", "i2s4", "dam0", "dam1", "dam2",
+                             "spdif", "amx", "amx1", "adx", "adx1",
+                             "afc0", "afc1", "afc2", "afc3", "afc4", "afc5";
+               dmas = <&apbdma 1>, <&apbdma 1>,
+                      <&apbdma 2>, <&apbdma 2>,
+                      <&apbdma 3>, <&apbdma 3>,
+                      <&apbdma 4>, <&apbdma 4>,
+                      <&apbdma 6>, <&apbdma 6>,
+                      <&apbdma 7>, <&apbdma 7>,
+                      <&apbdma 12>, <&apbdma 12>,
+                      <&apbdma 13>, <&apbdma 13>,
+                      <&apbdma 14>, <&apbdma 14>,
+                      <&apbdma 29>, <&apbdma 29>;
+               dma-names = "rx0", "tx0", "rx1", "tx1", "rx2", "tx2",
+                           "rx3", "tx3", "rx4", "tx4", "rx5", "tx5",
+                           "rx6", "tx6", "rx7", "tx7", "rx8", "tx8",
+                           "rx9", "tx9";
+               ranges;
+               #address-cells = <2>;
+               #size-cells = <2>;
+
+               tegra_i2s0: i2s@0,70301000 {
+                       compatible = "nvidia,tegra124-i2s";
+                       reg = <0x0 0x70301000 0x0 0x100>;
+                       nvidia,ahub-cif-ids = <4 4>;
+                       clocks = <&tegra_car TEGRA124_CLK_I2S0>;
+                       clock-names = "i2s";
+                       resets = <&tegra_car 30>;
+                       reset-names = "i2s";
+                       status = "disabled";
+               };
+
+               tegra_i2s1: i2s@0,70301100 {
+                       compatible = "nvidia,tegra124-i2s";
+                       reg = <0x0 0x70301100 0x0 0x100>;
+                       nvidia,ahub-cif-ids = <5 5>;
+                       clocks = <&tegra_car TEGRA124_CLK_I2S1>;
+                       clock-names = "i2s";
+                       resets = <&tegra_car 11>;
+                       reset-names = "i2s";
+                       status = "disabled";
+               };
+
+               tegra_i2s2: i2s@0,70301200 {
+                       compatible = "nvidia,tegra124-i2s";
+                       reg = <0x0 0x70301200 0x0 0x100>;
+                       nvidia,ahub-cif-ids = <6 6>;
+                       clocks = <&tegra_car TEGRA124_CLK_I2S2>;
+                       clock-names = "i2s";
+                       resets = <&tegra_car 18>;
+                       reset-names = "i2s";
+                       status = "disabled";
+               };
+
+               tegra_i2s3: i2s@0,70301300 {
+                       compatible = "nvidia,tegra124-i2s";
+                       reg = <0x0 0x70301300 0x0 0x100>;
+                       nvidia,ahub-cif-ids = <7 7>;
+                       clocks = <&tegra_car TEGRA124_CLK_I2S3>;
+                       clock-names = "i2s";
+                       resets = <&tegra_car 101>;
+                       reset-names = "i2s";
+                       status = "disabled";
+               };
+
+               tegra_i2s4: i2s@0,70301400 {
+                       compatible = "nvidia,tegra124-i2s";
+                       reg = <0x0 0x70301400 0x0 0x100>;
+                       nvidia,ahub-cif-ids = <8 8>;
+                       clocks = <&tegra_car TEGRA124_CLK_I2S4>;
+                       clock-names = "i2s";
+                       resets = <&tegra_car 102>;
+                       reset-names = "i2s";
+                       status = "disabled";
+               };
+       };
+
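+       /*
+        * Each EHCI controller is paired with a UTMI PHY node. The second
+        * "reg" entry of every PHY points at the USB1 block, which hosts
+        * the shared UTMI pad registers; only phy1 claims them via
+        * nvidia,has-utmi-pad-registers.
+        */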
+       usb@0,7d000000 {
+               compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci",
+                            "usb-ehci";
+               reg = <0x0 0x7d000000 0x0 0x4000>;
+               interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA124_CLK_USBD>;
+               clock-names = "usb";
+               resets = <&tegra_car 22>;
+               reset-names = "usb";
+               nvidia,phy = <&phy1>;
+               status = "disabled";
+       };
+
+       phy1: usb-phy@0,7d000000 {
+               compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy";
+               reg = <0x0 0x7d000000 0x0 0x4000>,
+                     <0x0 0x7d000000 0x0 0x4000>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA124_CLK_USBD>,
+                        <&tegra_car TEGRA124_CLK_PLL_U>,
+                        <&tegra_car TEGRA124_CLK_USBD>;
+               clock-names = "reg", "pll_u", "utmi-pads";
+               resets = <&tegra_car 22>, <&tegra_car 22>;
+               reset-names = "usb", "utmi-pads";
+               nvidia,hssync-start-delay = <0>;
+               nvidia,idle-wait-delay = <17>;
+               nvidia,elastic-limit = <16>;
+               nvidia,term-range-adj = <6>;
+               nvidia,xcvr-setup = <9>;
+               nvidia,xcvr-lsfslew = <0>;
+               nvidia,xcvr-lsrslew = <3>;
+               nvidia,hssquelch-level = <2>;
+               nvidia,hsdiscon-level = <5>;
+               nvidia,xcvr-hsslew = <12>;
+               nvidia,has-utmi-pad-registers;
+               status = "disabled";
+       };
+
+       usb@0,7d004000 {
+               compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci",
+                            "usb-ehci";
+               reg = <0x0 0x7d004000 0x0 0x4000>;
+               interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA124_CLK_USB2>;
+               clock-names = "usb";
+               resets = <&tegra_car 58>;
+               reset-names = "usb";
+               nvidia,phy = <&phy2>;
+               status = "disabled";
+       };
+
+       phy2: usb-phy@0,7d004000 {
+               compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy";
+               reg = <0x0 0x7d004000 0x0 0x4000>,
+                     <0x0 0x7d000000 0x0 0x4000>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA124_CLK_USB2>,
+                        <&tegra_car TEGRA124_CLK_PLL_U>,
+                        <&tegra_car TEGRA124_CLK_USBD>;
+               clock-names = "reg", "pll_u", "utmi-pads";
+               resets = <&tegra_car 58>, <&tegra_car 22>;
+               reset-names = "usb", "utmi-pads";
+               nvidia,hssync-start-delay = <0>;
+               nvidia,idle-wait-delay = <17>;
+               nvidia,elastic-limit = <16>;
+               nvidia,term-range-adj = <6>;
+               nvidia,xcvr-setup = <9>;
+               nvidia,xcvr-lsfslew = <0>;
+               nvidia,xcvr-lsrslew = <3>;
+               nvidia,hssquelch-level = <2>;
+               nvidia,hsdiscon-level = <5>;
+               nvidia,xcvr-hsslew = <12>;
+               status = "disabled";
+       };
+
+       usb@0,7d008000 {
+               compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci",
+                            "usb-ehci";
+               reg = <0x0 0x7d008000 0x0 0x4000>;
+               interrupts = <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA124_CLK_USB3>;
+               clock-names = "usb";
+               resets = <&tegra_car 59>;
+               reset-names = "usb";
+               nvidia,phy = <&phy3>;
+               status = "disabled";
+       };
+
+       phy3: usb-phy@0,7d008000 {
+               compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy";
+               reg = <0x0 0x7d008000 0x0 0x4000>,
+                     <0x0 0x7d000000 0x0 0x4000>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA124_CLK_USB3>,
+                        <&tegra_car TEGRA124_CLK_PLL_U>,
+                        <&tegra_car TEGRA124_CLK_USBD>;
+               clock-names = "reg", "pll_u", "utmi-pads";
+               resets = <&tegra_car 59>, <&tegra_car 22>;
+               reset-names = "usb", "utmi-pads";
+               nvidia,hssync-start-delay = <0>;
+               nvidia,idle-wait-delay = <17>;
+               nvidia,elastic-limit = <16>;
+               nvidia,term-range-adj = <6>;
+               nvidia,xcvr-setup = <9>;
+               nvidia,xcvr-lsfslew = <0>;
+               nvidia,xcvr-lsrslew = <3>;
+               nvidia,hssquelch-level = <2>;
+               nvidia,hsdiscon-level = <5>;
+               nvidia,xcvr-hsslew = <12>;
+               status = "disabled";
+       };
+
+       cpus {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               cpu@0 {
+                       device_type = "cpu";
+                       compatible = "nvidia,denver", "arm,armv8";
+                       reg = <0>;
+               };
+
+               cpu@1 {
+                       device_type = "cpu";
+                       compatible = "nvidia,denver", "arm,armv8";
+                       reg = <1>;
+               };
+       };
+
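+       /* architected timer; the arm_arch_timer driver also matches the ARMv7 compatible on arm64 */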
+       timer {
+               compatible = "arm,armv7-timer";
+               interrupts = <GIC_PPI 13
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>,
+                            <GIC_PPI 14
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>,
+                            <GIC_PPI 11
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>,
+                            <GIC_PPI 10
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>;
+               interrupt-parent = <&gic>;
+       };
+};
diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi
new file mode 100644 (file)
index 0000000..2b7f889
--- /dev/null
@@ -0,0 +1,45 @@
+#include "tegra210.dtsi"
+
+/ {
+       model = "NVIDIA Jetson TX1";
+       compatible = "nvidia,p2180", "nvidia,tegra210";
+
+       aliases {
+               rtc1 = "/rtc@0,7000e000";
+               serial0 = &uarta;
+       };
+
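+       /* 4 GiB of DRAM starting at 0x80000000 */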
+       memory {
+               device_type = "memory";
+               reg = <0x0 0x80000000 0x1 0x0>;
+       };
+
+       /* debug port */
+       serial@0,70006000 {
+               status = "okay";
+       };
+
+       pmc@0,7000e400 {
+               nvidia,invert-interrupt;
+       };
+
+       /* eMMC */
+       sdhci@0,700b0600 {
+               status = "okay";
+               bus-width = <8>;
+               non-removable;
+       };
+
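+       /* board-level fixed clocks; the clk32k_in label is consumed by the SoC .dtsi */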
+       clocks {
+               compatible = "simple-bus";
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               clk32k_in: clock@0 {
+                       compatible = "fixed-clock";
+                       reg = <0>;
+                       #clock-cells = <0>;
+                       clock-frequency = <32768>;
+               };
+       };
+};
diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts
new file mode 100644 (file)
index 0000000..1ddd851
--- /dev/null
@@ -0,0 +1,9 @@
+/dts-v1/;
+
+#include "tegra210-p2530.dtsi"
+#include "tegra210-p2595.dtsi"
+
+/ {
+       model = "NVIDIA Tegra210 P2371 (P2530/P2595) reference design";
+       compatible = "nvidia,p2371-0000", "nvidia,tegra210";
+};
diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts
new file mode 100644 (file)
index 0000000..683b339
--- /dev/null
@@ -0,0 +1,9 @@
+/dts-v1/;
+
+#include "tegra210-p2180.dtsi"
+#include "tegra210-p2597.dtsi"
+
+/ {
+       model = "NVIDIA Jetson TX1 Developer Kit";
+       compatible = "nvidia,p2371-2180", "nvidia,tegra210";
+};
diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi
new file mode 100644 (file)
index 0000000..ece0dec
--- /dev/null
@@ -0,0 +1,50 @@
+#include "tegra210.dtsi"
+
+/ {
+       model = "NVIDIA Tegra210 P2530 main board";
+       compatible = "nvidia,p2530", "nvidia,tegra210";
+
+       aliases {
+               rtc1 = "/rtc@0,7000e000";
+               serial0 = &uarta;
+       };
+
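+       /* 3 GiB of DRAM starting at 0x80000000 */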
+       memory {
+               device_type = "memory";
+               reg = <0x0 0x80000000 0x0 0xc0000000>;
+       };
+
+       /* debug port */
+       serial@0,70006000 {
+               status = "okay";
+       };
+
+       i2c@0,7000d000 {
+               status = "okay";
+               clock-frequency = <400000>;
+       };
+
+       pmc@0,7000e400 {
+               nvidia,invert-interrupt;
+       };
+
+       /* eMMC */
+       sdhci@0,700b0600 {
+               status = "okay";
+               bus-width = <8>;
+               non-removable;
+       };
+
+       clocks {
+               compatible = "simple-bus";
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               clk32k_in: clock@0 {
+                       compatible = "fixed-clock";
+                       reg = <0>;
+                       #clock-cells = <0>;
+                       clock-frequency = <32768>;
+               };
+       };
+};
diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts
new file mode 100644 (file)
index 0000000..58d27dd
--- /dev/null
@@ -0,0 +1,1302 @@
+/dts-v1/;
+
+#include <dt-bindings/input/input.h>
+#include "tegra210-p2530.dtsi"
+
+/ {
+       model = "NVIDIA Tegra210 P2571 reference design";
+       compatible = "nvidia,p2571", "nvidia,tegra210";
+
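+       /* "boot" pinmux state; most unused pins are parked: rsvd function, tristated, pull-down, input disabled */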
+       pinmux: pinmux@0,700008d4 {
+               pinctrl-names = "boot";
+               pinctrl-0 = <&state_boot>;
+
+               state_boot: pinmux {
+                       pex_l0_rst_n_pa0 {
+                               nvidia,pins = "pex_l0_rst_n_pa0";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       pex_l0_clkreq_n_pa1 {
+                               nvidia,pins = "pex_l0_clkreq_n_pa1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       pex_wake_n_pa2 {
+                               nvidia,pins = "pex_wake_n_pa2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       pex_l1_rst_n_pa3 {
+                               nvidia,pins = "pex_l1_rst_n_pa3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       pex_l1_clkreq_n_pa4 {
+                               nvidia,pins = "pex_l1_clkreq_n_pa4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       sata_led_active_pa5 {
+                               nvidia,pins = "sata_led_active_pa5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pa6 {
+                               nvidia,pins = "pa6";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_fs_pb0 {
+                               nvidia,pins = "dap1_fs_pb0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_din_pb1 {
+                               nvidia,pins = "dap1_din_pb1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_dout_pb2 {
+                               nvidia,pins = "dap1_dout_pb2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_sclk_pb3 {
+                               nvidia,pins = "dap1_sclk_pb3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_mosi_pb4 {
+                               nvidia,pins = "spi2_mosi_pb4";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_miso_pb5 {
+                               nvidia,pins = "spi2_miso_pb5";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_sck_pb6 {
+                               nvidia,pins = "spi2_sck_pb6";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_cs0_pb7 {
+                               nvidia,pins = "spi2_cs0_pb7";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_mosi_pc0 {
+                               nvidia,pins = "spi1_mosi_pc0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_miso_pc1 {
+                               nvidia,pins = "spi1_miso_pc1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_sck_pc2 {
+                               nvidia,pins = "spi1_sck_pc2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_cs0_pc3 {
+                               nvidia,pins = "spi1_cs0_pc3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_cs1_pc4 {
+                               nvidia,pins = "spi1_cs1_pc4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_sck_pc5 {
+                               nvidia,pins = "spi4_sck_pc5";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_cs0_pc6 {
+                               nvidia,pins = "spi4_cs0_pc6";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_mosi_pc7 {
+                               nvidia,pins = "spi4_mosi_pc7";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_miso_pd0 {
+                               nvidia,pins = "spi4_miso_pd0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_tx_pd1 {
+                               nvidia,pins = "uart3_tx_pd1";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_rx_pd2 {
+                               nvidia,pins = "uart3_rx_pd2";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_rts_pd3 {
+                               nvidia,pins = "uart3_rts_pd3";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_cts_pd4 {
+                               nvidia,pins = "uart3_cts_pd4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic1_clk_pe0 {
+                               nvidia,pins = "dmic1_clk_pe0";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic1_dat_pe1 {
+                               nvidia,pins = "dmic1_dat_pe1";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic2_clk_pe2 {
+                               nvidia,pins = "dmic2_clk_pe2";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic2_dat_pe3 {
+                               nvidia,pins = "dmic2_dat_pe3";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic3_clk_pe4 {
+                               nvidia,pins = "dmic3_clk_pe4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic3_dat_pe5 {
+                               nvidia,pins = "dmic3_dat_pe5";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pe6 {
+                               nvidia,pins = "pe6";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pe7 {
+                               nvidia,pins = "pe7";
+                               nvidia,function = "pwm3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen3_i2c_scl_pf0 {
+                               nvidia,pins = "gen3_i2c_scl_pf0";
+                               nvidia,function = "i2c3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen3_i2c_sda_pf1 {
+                               nvidia,pins = "gen3_i2c_sda_pf1";
+                               nvidia,function = "i2c3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_tx_pg0 {
+                               nvidia,pins = "uart2_tx_pg0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_rx_pg1 {
+                               nvidia,pins = "uart2_rx_pg1";
+                               nvidia,function = "uartb";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_rts_pg2 {
+                               nvidia,pins = "uart2_rts_pg2";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_cts_pg3 {
+                               nvidia,pins = "uart2_cts_pg3";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_en_ph0 {
+                               nvidia,pins = "wifi_en_ph0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_rst_ph1 {
+                               nvidia,pins = "wifi_rst_ph1";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_wake_ap_ph2 {
+                               nvidia,pins = "wifi_wake_ap_ph2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_wake_bt_ph3 {
+                               nvidia,pins = "ap_wake_bt_ph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       bt_rst_ph4 {
+                               nvidia,pins = "bt_rst_ph4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       bt_wake_ap_ph5 {
+                               nvidia,pins = "bt_wake_ap_ph5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ph6 {
+                               nvidia,pins = "ph6";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_wake_nfc_ph7 {
+                               nvidia,pins = "ap_wake_nfc_ph7";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       nfc_en_pi0 {
+                               nvidia,pins = "nfc_en_pi0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       nfc_int_pi1 {
+                               nvidia,pins = "nfc_int_pi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gps_en_pi2 {
+                               nvidia,pins = "gps_en_pi2";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gps_rst_pi3 {
+                               nvidia,pins = "gps_rst_pi3";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_tx_pi4 {
+                               nvidia,pins = "uart4_tx_pi4";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_rx_pi5 {
+                               nvidia,pins = "uart4_rx_pi5";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_rts_pi6 {
+                               nvidia,pins = "uart4_rts_pi6";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_cts_pi7 {
+                               nvidia,pins = "uart4_cts_pi7";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen1_i2c_sda_pj0 {
+                               nvidia,pins = "gen1_i2c_sda_pj0";
+                               nvidia,function = "i2c1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen1_i2c_scl_pj1 {
+                               nvidia,pins = "gen1_i2c_scl_pj1";
+                               nvidia,function = "i2c1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen2_i2c_scl_pj2 {
+                               nvidia,pins = "gen2_i2c_scl_pj2";
+                               nvidia,function = "i2c2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       gen2_i2c_sda_pj3 {
+                               nvidia,pins = "gen2_i2c_sda_pj3";
+                               nvidia,function = "i2c2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       dap4_fs_pj4 {
+                               nvidia,pins = "dap4_fs_pj4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_din_pj5 {
+                               nvidia,pins = "dap4_din_pj5";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_dout_pj6 {
+                               nvidia,pins = "dap4_dout_pj6";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_sclk_pj7 {
+                               nvidia,pins = "dap4_sclk_pj7";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk0 {
+                               nvidia,pins = "pk0";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk1 {
+                               nvidia,pins = "pk1";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk2 {
+                               nvidia,pins = "pk2";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk3 {
+                               nvidia,pins = "pk3";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk4 {
+                               nvidia,pins = "pk4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk5 {
+                               nvidia,pins = "pk5";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk6 {
+                               nvidia,pins = "pk6";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk7 {
+                               nvidia,pins = "pk7";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pl0 {
+                               nvidia,pins = "pl0";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pl1 {
+                               nvidia,pins = "pl1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_clk_pm0 {
+                               nvidia,pins = "sdmmc1_clk_pm0";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_cmd_pm1 {
+                               nvidia,pins = "sdmmc1_cmd_pm1";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat3_pm2 {
+                               nvidia,pins = "sdmmc1_dat3_pm2";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat2_pm3 {
+                               nvidia,pins = "sdmmc1_dat2_pm3";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat1_pm4 {
+                               nvidia,pins = "sdmmc1_dat1_pm4";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat0_pm5 {
+                               nvidia,pins = "sdmmc1_dat0_pm5";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_clk_pp0 {
+                               nvidia,pins = "sdmmc3_clk_pp0";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_cmd_pp1 {
+                               nvidia,pins = "sdmmc3_cmd_pp1";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat3_pp2 {
+                               nvidia,pins = "sdmmc3_dat3_pp2";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat2_pp3 {
+                               nvidia,pins = "sdmmc3_dat2_pp3";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat1_pp4 {
+                               nvidia,pins = "sdmmc3_dat1_pp4";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat0_pp5 {
+                               nvidia,pins = "sdmmc3_dat0_pp5";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_mclk_ps0 {
+                               nvidia,pins = "cam1_mclk_ps0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam2_mclk_ps1 {
+                               nvidia,pins = "cam2_mclk_ps1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_i2c_scl_ps2 {
+                               nvidia,pins = "cam_i2c_scl_ps2";
+                               nvidia,function = "i2cvi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_i2c_sda_ps3 {
+                               nvidia,pins = "cam_i2c_sda_ps3";
+                               nvidia,function = "i2cvi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_rst_ps4 {
+                               nvidia,pins = "cam_rst_ps4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_af_en_ps5 {
+                               nvidia,pins = "cam_af_en_ps5";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_flash_en_ps6 {
+                               nvidia,pins = "cam_flash_en_ps6";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_pwdn_ps7 {
+                               nvidia,pins = "cam1_pwdn_ps7";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam2_pwdn_pt0 {
+                               nvidia,pins = "cam2_pwdn_pt0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_strobe_pt1 {
+                               nvidia,pins = "cam1_strobe_pt1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_tx_pu0 {
+                               nvidia,pins = "uart1_tx_pu0";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_rx_pu1 {
+                               nvidia,pins = "uart1_rx_pu1";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_rts_pu2 {
+                               nvidia,pins = "uart1_rts_pu2";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_cts_pu3 {
+                               nvidia,pins = "uart1_cts_pu3";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_bl_pwm_pv0 {
+                               nvidia,pins = "lcd_bl_pwm_pv0";
+                               nvidia,function = "pwm0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_bl_en_pv1 {
+                               nvidia,pins = "lcd_bl_en_pv1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_rst_pv2 {
+                               nvidia,pins = "lcd_rst_pv2";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_gpio1_pv3 {
+                               nvidia,pins = "lcd_gpio1_pv3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_gpio2_pv4 {
+                               nvidia,pins = "lcd_gpio2_pv4";
+                               nvidia,function = "pwm1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_ready_pv5 {
+                               nvidia,pins = "ap_ready_pv5";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_rst_pv6 {
+                               nvidia,pins = "touch_rst_pv6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_clk_pv7 {
+                               nvidia,pins = "touch_clk_pv7";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       modem_wake_ap_px0 {
+                               nvidia,pins = "modem_wake_ap_px0";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_int_px1 {
+                               nvidia,pins = "touch_int_px1";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       motion_int_px2 {
+                               nvidia,pins = "motion_int_px2";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       als_prox_int_px3 {
+                               nvidia,pins = "als_prox_int_px3";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       temp_alert_px4 {
+                               nvidia,pins = "temp_alert_px4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_power_on_px5 {
+                               nvidia,pins = "button_power_on_px5";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_vol_up_px6 {
+                               nvidia,pins = "button_vol_up_px6";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_vol_down_px7 {
+                               nvidia,pins = "button_vol_down_px7";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_slide_sw_py0 {
+                               nvidia,pins = "button_slide_sw_py0";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_home_py1 {
+                               nvidia,pins = "button_home_py1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_te_py2 {
+                               nvidia,pins = "lcd_te_py2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_i2c_scl_py3 {
+                               nvidia,pins = "pwr_i2c_scl_py3";
+                               nvidia,function = "i2cpmu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_i2c_sda_py4 {
+                               nvidia,pins = "pwr_i2c_sda_py4";
+                               nvidia,function = "i2cpmu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_32k_out_py5 {
+                               nvidia,pins = "clk_32k_out_py5";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz0 {
+                               nvidia,pins = "pz0";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz1 {
+                               nvidia,pins = "pz1";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz2 {
+                               nvidia,pins = "pz2";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz3 {
+                               nvidia,pins = "pz3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz4 {
+                               nvidia,pins = "pz4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz5 {
+                               nvidia,pins = "pz5";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_fs_paa0 {
+                               nvidia,pins = "dap2_fs_paa0";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_sclk_paa1 {
+                               nvidia,pins = "dap2_sclk_paa1";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_din_paa2 {
+                               nvidia,pins = "dap2_din_paa2";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_dout_paa3 {
+                               nvidia,pins = "dap2_dout_paa3";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       aud_mclk_pbb0 {
+                               nvidia,pins = "aud_mclk_pbb0";
+                               nvidia,function = "aud";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dvfs_pwm_pbb1 {
+                               nvidia,pins = "dvfs_pwm_pbb1";
+                               nvidia,function = "cldvfs";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dvfs_clk_pbb2 {
+                               nvidia,pins = "dvfs_clk_pbb2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gpio_x1_aud_pbb3 {
+                               nvidia,pins = "gpio_x1_aud_pbb3";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gpio_x3_aud_pbb4 {
+                               nvidia,pins = "gpio_x3_aud_pbb4";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       hdmi_cec_pcc0 {
+                               nvidia,pins = "hdmi_cec_pcc0";
+                               nvidia,function = "cec";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       hdmi_int_dp_hpd_pcc1 {
+                               nvidia,pins = "hdmi_int_dp_hpd_pcc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       spdif_out_pcc2 {
+                               nvidia,pins = "spdif_out_pcc2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spdif_in_pcc3 {
+                               nvidia,pins = "spdif_in_pcc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       usb_vbus_en0_pcc4 {
+                               nvidia,pins = "usb_vbus_en0_pcc4";
+                               nvidia,function = "usb";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       usb_vbus_en1_pcc5 {
+                               nvidia,pins = "usb_vbus_en1_pcc5";
+                               nvidia,function = "usb";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       dp_hpd0_pcc6 {
+                               nvidia,pins = "dp_hpd0_pcc6";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pcc7 {
+                               nvidia,pins = "pcc7";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_cs1_pdd0 {
+                               nvidia,pins = "spi2_cs1_pdd0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_sck_pee0 {
+                               nvidia,pins = "qspi_sck_pee0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_cs_n_pee1 {
+                               nvidia,pins = "qspi_cs_n_pee1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io0_pee2 {
+                               nvidia,pins = "qspi_io0_pee2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io1_pee3 {
+                               nvidia,pins = "qspi_io1_pee3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io2_pee4 {
+                               nvidia,pins = "qspi_io2_pee4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io3_pee5 {
+                               nvidia,pins = "qspi_io3_pee5";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       core_pwr_req {
+                               nvidia,pins = "core_pwr_req";
+                               nvidia,function = "core";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cpu_pwr_req {
+                               nvidia,pins = "cpu_pwr_req";
+                               nvidia,function = "cpu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_int_n {
+                               nvidia,pins = "pwr_int_n";
+                               nvidia,function = "pmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_32k_in {
+                               nvidia,pins = "clk_32k_in";
+                               nvidia,function = "clk";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       jtag_rtck {
+                               nvidia,pins = "jtag_rtck";
+                               nvidia,function = "jtag";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_req {
+                               nvidia,pins = "clk_req";
+                               nvidia,function = "sys";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       shutdown {
+                               nvidia,pins = "shutdown";
+                               nvidia,function = "shutdown";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+               };
+       };
+};
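
For reference, each child node in the pinmux states above follows the nvidia,tegra210-pinmux device tree binding: nvidia,pins names the pad(s) being configured; nvidia,function selects the mux option (nodes that omit it appear to leave the existing mux selection in place and only update the pad electricals); the remaining properties take the TEGRA_PIN_* constants from dt-bindings/pinctrl/pinctrl-tegra.h. Below is a minimal annotated sketch of the node shape, with a made-up pad name ("some_pad_px0"); it is illustrative only and not part of the patch.

        state_example: pinmux {
                some_pad_px0 {                                    /* hypothetical pad */
                        nvidia,pins = "some_pad_px0";             /* pad(s) this node configures */
                        nvidia,function = "rsvd0";                /* mux option; optional property */
                        nvidia,pull = <TEGRA_PIN_PULL_UP>;        /* internal pull: NONE, UP or DOWN */
                        nvidia,tristate = <TEGRA_PIN_DISABLE>;    /* DISABLE = output driven, ENABLE = Hi-Z */
                        nvidia,enable-input = <TEGRA_PIN_ENABLE>; /* input buffer on */
                        nvidia,open-drain = <TEGRA_PIN_DISABLE>;  /* push-pull rather than open-drain */
                        nvidia,io-hv = <TEGRA_PIN_ENABLE>;        /* high-voltage (3.3 V) mode, on pads that support it */
                };
        };
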
diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi
new file mode 100644 (file)
index 0000000..f3f9139
--- /dev/null
+++ b/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi
@@ -0,0 +1,1272 @@
+/ {
+       model = "NVIDIA Tegra210 P2595 I/O board";
+       compatible = "nvidia,p2595", "nvidia,tegra210";
+
+       pinmux: pinmux@0,700008d4 {
+               pinctrl-names = "boot";
+               pinctrl-0 = <&state_boot>;
+
+               state_boot: pinmux {
+                       pex_l0_rst_n_pa0 {
+                               nvidia,pins = "pex_l0_rst_n_pa0";
+                               nvidia,function = "pe0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_l0_clkreq_n_pa1 {
+                               nvidia,pins = "pex_l0_clkreq_n_pa1";
+                               nvidia,function = "pe0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_wake_n_pa2 {
+                               nvidia,pins = "pex_wake_n_pa2";
+                               nvidia,function = "pe";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_l1_rst_n_pa3 {
+                               nvidia,pins = "pex_l1_rst_n_pa3";
+                               nvidia,function = "pe1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_l1_clkreq_n_pa4 {
+                               nvidia,pins = "pex_l1_clkreq_n_pa4";
+                               nvidia,function = "pe1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       sata_led_active_pa5 {
+                               nvidia,pins = "sata_led_active_pa5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pa6 {
+                               nvidia,pins = "pa6";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_fs_pb0 {
+                               nvidia,pins = "dap1_fs_pb0";
+                               nvidia,function = "i2s1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_din_pb1 {
+                               nvidia,pins = "dap1_din_pb1";
+                               nvidia,function = "i2s1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_dout_pb2 {
+                               nvidia,pins = "dap1_dout_pb2";
+                               nvidia,function = "i2s1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_sclk_pb3 {
+                               nvidia,pins = "dap1_sclk_pb3";
+                               nvidia,function = "i2s1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_mosi_pb4 {
+                               nvidia,pins = "spi2_mosi_pb4";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_miso_pb5 {
+                               nvidia,pins = "spi2_miso_pb5";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_sck_pb6 {
+                               nvidia,pins = "spi2_sck_pb6";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_cs0_pb7 {
+                               nvidia,pins = "spi2_cs0_pb7";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_mosi_pc0 {
+                               nvidia,pins = "spi1_mosi_pc0";
+                               nvidia,function = "spi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_miso_pc1 {
+                               nvidia,pins = "spi1_miso_pc1";
+                               nvidia,function = "spi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_sck_pc2 {
+                               nvidia,pins = "spi1_sck_pc2";
+                               nvidia,function = "spi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_cs0_pc3 {
+                               nvidia,pins = "spi1_cs0_pc3";
+                               nvidia,function = "spi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_cs1_pc4 {
+                               nvidia,pins = "spi1_cs1_pc4";
+                               nvidia,function = "spi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_sck_pc5 {
+                               nvidia,pins = "spi4_sck_pc5";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_cs0_pc6 {
+                               nvidia,pins = "spi4_cs0_pc6";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_mosi_pc7 {
+                               nvidia,pins = "spi4_mosi_pc7";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_miso_pd0 {
+                               nvidia,pins = "spi4_miso_pd0";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
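+                       /* UART3 (uartc): TX/RTS driven out, RX/CTS pulled up with input enabled */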
+                       uart3_tx_pd1 {
+                               nvidia,pins = "uart3_tx_pd1";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_rx_pd2 {
+                               nvidia,pins = "uart3_rx_pd2";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_rts_pd3 {
+                               nvidia,pins = "uart3_rts_pd3";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_cts_pd4 {
+                               nvidia,pins = "uart3_cts_pd4";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
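+                       /* Digital mics: DMIC1/DMIC2 clocks driven, data pads pulled down as inputs; the DMIC3 clock pad is left as a GPIO and its data pad parked on rsvd2 */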
+                       dmic1_clk_pe0 {
+                               nvidia,pins = "dmic1_clk_pe0";
+                               nvidia,function = "dmic1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic1_dat_pe1 {
+                               nvidia,pins = "dmic1_dat_pe1";
+                               nvidia,function = "dmic1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic2_clk_pe2 {
+                               nvidia,pins = "dmic2_clk_pe2";
+                               nvidia,function = "dmic2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic2_dat_pe3 {
+                               nvidia,pins = "dmic2_dat_pe3";
+                               nvidia,function = "dmic2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic3_clk_pe4 {
+                               nvidia,pins = "dmic3_clk_pe4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic3_dat_pe5 {
+                               nvidia,pins = "dmic3_dat_pe5";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pe6 {
+                               nvidia,pins = "pe6";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pe7 {
+                               nvidia,pins = "pe7";
+                               nvidia,function = "pwm3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
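+                       /* I2C3 (GEN3): no internal pulls, input buffers enabled, high-voltage I/O off */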
+                       gen3_i2c_scl_pf0 {
+                               nvidia,pins = "gen3_i2c_scl_pf0";
+                               nvidia,function = "i2c3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen3_i2c_sda_pf1 {
+                               nvidia,pins = "gen3_i2c_sda_pf1";
+                               nvidia,function = "i2c3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
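+                       /* UART2 pads: TX and CTS left as GPIOs; RX is muxed to uartb but tristated, RTS parked on rsvd2 */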
+                       uart2_tx_pg0 {
+                               nvidia,pins = "uart2_tx_pg0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_rx_pg1 {
+                               nvidia,pins = "uart2_rx_pg1";
+                               nvidia,function = "uartb";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_rts_pg2 {
+                               nvidia,pins = "uart2_rts_pg2";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_cts_pg3 {
+                               nvidia,pins = "uart2_cts_pg3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
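+                       /* Wi-Fi / Bluetooth controls (ph0..ph5): wake lines pulled up as inputs, enables/resets driven as GPIOs, wifi_rst parked tristated */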
+                       wifi_en_ph0 {
+                               nvidia,pins = "wifi_en_ph0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_rst_ph1 {
+                               nvidia,pins = "wifi_rst_ph1";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_wake_ap_ph2 {
+                               nvidia,pins = "wifi_wake_ap_ph2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_wake_bt_ph3 {
+                               nvidia,pins = "ap_wake_bt_ph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       bt_rst_ph4 {
+                               nvidia,pins = "bt_rst_ph4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       bt_wake_ap_ph5 {
+                               nvidia,pins = "bt_wake_ap_ph5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ph6 {
+                               nvidia,pins = "ph6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_wake_nfc_ph7 {
+                               nvidia,pins = "ap_wake_nfc_ph7";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       nfc_en_pi0 {
+                               nvidia,pins = "nfc_en_pi0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       nfc_int_pi1 {
+                               nvidia,pins = "nfc_int_pi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gps_en_pi2 {
+                               nvidia,pins = "gps_en_pi2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gps_rst_pi3 {
+                               nvidia,pins = "gps_rst_pi3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
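+                       /* UART4 (uartd): full 4-wire setup, RX/CTS input-enabled */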
+                       uart4_tx_pi4 {
+                               nvidia,pins = "uart4_tx_pi4";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_rx_pi5 {
+                               nvidia,pins = "uart4_rx_pi5";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_rts_pi6 {
+                               nvidia,pins = "uart4_rts_pi6";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_cts_pi7 {
+                               nvidia,pins = "uart4_cts_pi7";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
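+                       /* I2C1 (GEN1) and I2C2 (GEN2); note GEN2 enables high-voltage I/O, GEN1 does not */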
+                       gen1_i2c_sda_pj0 {
+                               nvidia,pins = "gen1_i2c_sda_pj0";
+                               nvidia,function = "i2c1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen1_i2c_scl_pj1 {
+                               nvidia,pins = "gen1_i2c_scl_pj1";
+                               nvidia,function = "i2c1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen2_i2c_scl_pj2 {
+                               nvidia,pins = "gen2_i2c_scl_pj2";
+                               nvidia,function = "i2c2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       gen2_i2c_sda_pj3 {
+                               nvidia,pins = "gen2_i2c_sda_pj3";
+                               nvidia,function = "i2c2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
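+                       /* I2S4B on the DAP4 pads (pj4..pj7), all with input buffers enabled */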
+                       dap4_fs_pj4 {
+                               nvidia,pins = "dap4_fs_pj4";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_din_pj5 {
+                               nvidia,pins = "dap4_din_pj5";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_dout_pj6 {
+                               nvidia,pins = "dap4_dout_pj6";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_sclk_pj7 {
+                               nvidia,pins = "dap4_sclk_pj7";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
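+                       /* pk0..pk3 carry i2s5b with pull-ups; pk4..pk7 are plain GPIOs */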
+                       pk0 {
+                               nvidia,pins = "pk0";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk1 {
+                               nvidia,pins = "pk1";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk2 {
+                               nvidia,pins = "pk2";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk3 {
+                               nvidia,pins = "pk3";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk4 {
+                               nvidia,pins = "pk4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk5 {
+                               nvidia,pins = "pk5";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk6 {
+                               nvidia,pins = "pk6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk7 {
+                               nvidia,pins = "pk7";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pl0 {
+                               nvidia,pins = "pl0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pl1 {
+                               nvidia,pins = "pl1";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
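+                       /* SDMMC1: CMD and DAT lines pulled up as the SD bus requires, CLK without a pull; input buffers enabled */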
+                       sdmmc1_clk_pm0 {
+                               nvidia,pins = "sdmmc1_clk_pm0";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_cmd_pm1 {
+                               nvidia,pins = "sdmmc1_cmd_pm1";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat3_pm2 {
+                               nvidia,pins = "sdmmc1_dat3_pm2";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat2_pm3 {
+                               nvidia,pins = "sdmmc1_dat2_pm3";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat1_pm4 {
+                               nvidia,pins = "sdmmc1_dat1_pm4";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat0_pm5 {
+                               nvidia,pins = "sdmmc1_dat0_pm5";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
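+                       /* SDMMC3: same pull/input pattern as SDMMC1 */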
+                       sdmmc3_clk_pp0 {
+                               nvidia,pins = "sdmmc3_clk_pp0";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_cmd_pp1 {
+                               nvidia,pins = "sdmmc3_cmd_pp1";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat3_pp2 {
+                               nvidia,pins = "sdmmc3_dat3_pp2";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat2_pp3 {
+                               nvidia,pins = "sdmmc3_dat2_pp3";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat1_pp4 {
+                               nvidia,pins = "sdmmc3_dat1_pp4";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat0_pp5 {
+                               nvidia,pins = "sdmmc3_dat0_pp5";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
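+                       /* Camera block: sensor clocks on extperiph3, camera I2C on i2cvi, reset/power-down/flash/strobe driven as GPIOs */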
+                       cam1_mclk_ps0 {
+                               nvidia,pins = "cam1_mclk_ps0";
+                               nvidia,function = "extperiph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam2_mclk_ps1 {
+                               nvidia,pins = "cam2_mclk_ps1";
+                               nvidia,function = "extperiph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_i2c_scl_ps2 {
+                               nvidia,pins = "cam_i2c_scl_ps2";
+                               nvidia,function = "i2cvi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_i2c_sda_ps3 {
+                               nvidia,pins = "cam_i2c_sda_ps3";
+                               nvidia,function = "i2cvi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_rst_ps4 {
+                               nvidia,pins = "cam_rst_ps4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_af_en_ps5 {
+                               nvidia,pins = "cam_af_en_ps5";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_flash_en_ps6 {
+                               nvidia,pins = "cam_flash_en_ps6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_pwdn_ps7 {
+                               nvidia,pins = "cam1_pwdn_ps7";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam2_pwdn_pt0 {
+                               nvidia,pins = "cam2_pwdn_pt0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_strobe_pt1 {
+                               nvidia,pins = "cam1_strobe_pt1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
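+                       /* UART1 (uarta): RX/CTS pulled up with input enabled */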
+                       uart1_tx_pu0 {
+                               nvidia,pins = "uart1_tx_pu0";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_rx_pu1 {
+                               nvidia,pins = "uart1_rx_pu1";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_rts_pu2 {
+                               nvidia,pins = "uart1_rts_pu2";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_cts_pu3 {
+                               nvidia,pins = "uart1_cts_pu3";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
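+                       /* LCD: backlight PWM on pwm0, enable/reset as GPIOs; lcd_gpio1 parked on rsvd1, lcd_gpio2 muxed to pwm1 */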
+                       lcd_bl_pwm_pv0 {
+                               nvidia,pins = "lcd_bl_pwm_pv0";
+                               nvidia,function = "pwm0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_bl_en_pv1 {
+                               nvidia,pins = "lcd_bl_en_pv1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_rst_pv2 {
+                               nvidia,pins = "lcd_rst_pv2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_gpio1_pv3 {
+                               nvidia,pins = "lcd_gpio1_pv3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_gpio2_pv4 {
+                               nvidia,pins = "lcd_gpio2_pv4";
+                               nvidia,function = "pwm1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_ready_pv5 {
+                               nvidia,pins = "ap_ready_pv5";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_rst_pv6 {
+                               nvidia,pins = "touch_rst_pv6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_clk_pv7 {
+                               nvidia,pins = "touch_clk_pv7";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       modem_wake_ap_px0 {
+                               nvidia,pins = "modem_wake_ap_px0";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_int_px1 {
+                               nvidia,pins = "touch_int_px1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       motion_int_px2 {
+                               nvidia,pins = "motion_int_px2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       als_prox_int_px3 {
+                               nvidia,pins = "als_prox_int_px3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       temp_alert_px4 {
+                               nvidia,pins = "temp_alert_px4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
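+                       /* Buttons (power, volume up/down, home) pulled up as inputs; the slide-switch pad is parked on rsvd0 */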
+                       button_power_on_px5 {
+                               nvidia,pins = "button_power_on_px5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_vol_up_px6 {
+                               nvidia,pins = "button_vol_up_px6";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_vol_down_px7 {
+                               nvidia,pins = "button_vol_down_px7";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_slide_sw_py0 {
+                               nvidia,pins = "button_slide_sw_py0";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_home_py1 {
+                               nvidia,pins = "button_home_py1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_te_py2 {
+                               nvidia,pins = "lcd_te_py2";
+                               nvidia,function = "displaya";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
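+                       /* PMIC I2C (i2cpmu) on py3/py4, plus the 32 kHz clock output on py5 */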
+                       pwr_i2c_scl_py3 {
+                               nvidia,pins = "pwr_i2c_scl_py3";
+                               nvidia,function = "i2cpmu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_i2c_sda_py4 {
+                               nvidia,pins = "pwr_i2c_sda_py4";
+                               nvidia,function = "i2cpmu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_32k_out_py5 {
+                               nvidia,pins = "clk_32k_out_py5";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz0 {
+                               nvidia,pins = "pz0";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz1 {
+                               nvidia,pins = "pz1";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz2 {
+                               nvidia,pins = "pz2";
+                               nvidia,function = "rsvd2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz3 {
+                               nvidia,pins = "pz3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz4 {
+                               nvidia,pins = "pz4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz5 {
+                               nvidia,pins = "pz5";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
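+                       /* I2S2 on the DAP2 pads (paa0..paa3) */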
+                       dap2_fs_paa0 {
+                               nvidia,pins = "dap2_fs_paa0";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_sclk_paa1 {
+                               nvidia,pins = "dap2_sclk_paa1";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_din_paa2 {
+                               nvidia,pins = "dap2_din_paa2";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_dout_paa3 {
+                               nvidia,pins = "dap2_dout_paa3";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       aud_mclk_pbb0 {
+                               nvidia,pins = "aud_mclk_pbb0";
+                               nvidia,function = "aud";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dvfs_pwm_pbb1 {
+                               nvidia,pins = "dvfs_pwm_pbb1";
+                               nvidia,function = "cldvfs";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dvfs_clk_pbb2 {
+                               nvidia,pins = "dvfs_clk_pbb2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gpio_x1_aud_pbb3 {
+                               nvidia,pins = "gpio_x1_aud_pbb3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gpio_x3_aud_pbb4 {
+                               nvidia,pins = "gpio_x3_aud_pbb4";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
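+                       /* HDMI: CEC (high-voltage I/O enabled) and the HDMI/DP hot-plug detect input */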
+                       hdmi_cec_pcc0 {
+                               nvidia,pins = "hdmi_cec_pcc0";
+                               nvidia,function = "cec";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       hdmi_int_dp_hpd_pcc1 {
+                               nvidia,pins = "hdmi_int_dp_hpd_pcc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       spdif_out_pcc2 {
+                               nvidia,pins = "spdif_out_pcc2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spdif_in_pcc3 {
+                               nvidia,pins = "spdif_in_pcc3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
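+                       /* USB VBUS enables: EN0 active on the usb function with high-voltage I/O, EN1 parked on rsvd1 */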
+                       usb_vbus_en0_pcc4 {
+                               nvidia,pins = "usb_vbus_en0_pcc4";
+                               nvidia,function = "usb";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       usb_vbus_en1_pcc5 {
+                               nvidia,pins = "usb_vbus_en1_pcc5";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       dp_hpd0_pcc6 {
+                               nvidia,pins = "dp_hpd0_pcc6";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pcc7 {
+                               nvidia,pins = "pcc7";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_cs1_pdd0 {
+                               nvidia,pins = "spi2_cs1_pdd0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_sck_pee0 {
+                               nvidia,pins = "qspi_sck_pee0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_cs_n_pee1 {
+                               nvidia,pins = "qspi_cs_n_pee1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io0_pee2 {
+                               nvidia,pins = "qspi_io0_pee2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io1_pee3 {
+                               nvidia,pins = "qspi_io1_pee3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io2_pee4 {
+                               nvidia,pins = "qspi_io2_pee4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io3_pee5 {
+                               nvidia,pins = "qspi_io3_pee5";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       core_pwr_req {
+                               nvidia,pins = "core_pwr_req";
+                               nvidia,function = "core";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cpu_pwr_req {
+                               nvidia,pins = "cpu_pwr_req";
+                               nvidia,function = "cpu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_int_n {
+                               nvidia,pins = "pwr_int_n";
+                               nvidia,function = "pmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_32k_in {
+                               nvidia,pins = "clk_32k_in";
+                               nvidia,function = "clk";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       jtag_rtck {
+                               nvidia,pins = "jtag_rtck";
+                               nvidia,function = "jtag";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_req {
+                               nvidia,pins = "clk_req";
+                               nvidia,function = "sys";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       shutdown {
+                               nvidia,pins = "shutdown";
+                               nvidia,function = "shutdown";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+               };
+       };
+};
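
Every hunk in these pinmux files follows one repeating shape: a child node of the boot-time state names a pin group through nvidia,pins and then fixes its mux function, pull, tristate, input buffer, and open-drain settings with the TEGRA_PIN_* constants, with dual-voltage pads additionally carrying nvidia,io-hv to select high-voltage operation. A minimal, self-contained sketch of that shape, assuming the constants come from the usual dt-bindings/pinctrl/pinctrl-tegra.h header and using uart1_tx_pu0 purely as an illustrative pin group:

	/dts-v1/;

	#include <dt-bindings/pinctrl/pinctrl-tegra.h>

	/ {
		pinmux: pinmux@0,700008d4 {
			/* The "boot" state documents the configuration the
			 * pads are expected to have coming out of the
			 * bootloader. */
			pinctrl-names = "boot";
			pinctrl-0 = <&state_boot>;

			state_boot: pinmux {
				uart1_tx_pu0 {
					nvidia,pins = "uart1_tx_pu0";
					/* mux the pad to the UART A controller */
					nvidia,function = "uarta";
					/* no internal pull resistor */
					nvidia,pull = <TEGRA_PIN_PULL_NONE>;
					/* output driver enabled (not tristated) */
					nvidia,tristate = <TEGRA_PIN_DISABLE>;
					/* input buffer off: a TX line is output-only */
					nvidia,enable-input = <TEGRA_PIN_DISABLE>;
					nvidia,open-drain = <TEGRA_PIN_DISABLE>;
				};
			};
		};
	};
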
diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi
new file mode 100644
index 0000000..be3eccb
--- /dev/null
+++ b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi
@@ -0,0 +1,1270 @@
+/ {
+       model = "NVIDIA Tegra210 P2597 I/O board";
+       compatible = "nvidia,p2597", "nvidia,tegra210";
+
+       pinmux: pinmux@0,700008d4 {
+               pinctrl-names = "boot";
+               pinctrl-0 = <&state_boot>;
+
+               state_boot: pinmux {
+                       pex_l0_rst_n_pa0 {
+                               nvidia,pins = "pex_l0_rst_n_pa0";
+                               nvidia,function = "pe0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_l0_clkreq_n_pa1 {
+                               nvidia,pins = "pex_l0_clkreq_n_pa1";
+                               nvidia,function = "pe0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_wake_n_pa2 {
+                               nvidia,pins = "pex_wake_n_pa2";
+                               nvidia,function = "pe";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_l1_rst_n_pa3 {
+                               nvidia,pins = "pex_l1_rst_n_pa3";
+                               nvidia,function = "pe1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       pex_l1_clkreq_n_pa4 {
+                               nvidia,pins = "pex_l1_clkreq_n_pa4";
+                               nvidia,function = "pe1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       sata_led_active_pa5 {
+                               nvidia,pins = "sata_led_active_pa5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pa6 {
+                               nvidia,pins = "pa6";
+                               nvidia,function = "sata";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_fs_pb0 {
+                               nvidia,pins = "dap1_fs_pb0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_din_pb1 {
+                               nvidia,pins = "dap1_din_pb1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_dout_pb2 {
+                               nvidia,pins = "dap1_dout_pb2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap1_sclk_pb3 {
+                               nvidia,pins = "dap1_sclk_pb3";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_mosi_pb4 {
+                               nvidia,pins = "spi2_mosi_pb4";
+                               nvidia,function = "spi2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_miso_pb5 {
+                               nvidia,pins = "spi2_miso_pb5";
+                               nvidia,function = "spi2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_sck_pb6 {
+                               nvidia,pins = "spi2_sck_pb6";
+                               nvidia,function = "spi2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_cs0_pb7 {
+                               nvidia,pins = "spi2_cs0_pb7";
+                               nvidia,function = "spi2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_mosi_pc0 {
+                               nvidia,pins = "spi1_mosi_pc0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_miso_pc1 {
+                               nvidia,pins = "spi1_miso_pc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_sck_pc2 {
+                               nvidia,pins = "spi1_sck_pc2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_cs0_pc3 {
+                               nvidia,pins = "spi1_cs0_pc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi1_cs1_pc4 {
+                               nvidia,pins = "spi1_cs1_pc4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_sck_pc5 {
+                               nvidia,pins = "spi4_sck_pc5";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_cs0_pc6 {
+                               nvidia,pins = "spi4_cs0_pc6";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_mosi_pc7 {
+                               nvidia,pins = "spi4_mosi_pc7";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi4_miso_pd0 {
+                               nvidia,pins = "spi4_miso_pd0";
+                               nvidia,function = "spi4";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_tx_pd1 {
+                               nvidia,pins = "uart3_tx_pd1";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_rx_pd2 {
+                               nvidia,pins = "uart3_rx_pd2";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_rts_pd3 {
+                               nvidia,pins = "uart3_rts_pd3";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart3_cts_pd4 {
+                               nvidia,pins = "uart3_cts_pd4";
+                               nvidia,function = "uartc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic1_clk_pe0 {
+                               nvidia,pins = "dmic1_clk_pe0";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic1_dat_pe1 {
+                               nvidia,pins = "dmic1_dat_pe1";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic2_clk_pe2 {
+                               nvidia,pins = "dmic2_clk_pe2";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic2_dat_pe3 {
+                               nvidia,pins = "dmic2_dat_pe3";
+                               nvidia,function = "i2s3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic3_clk_pe4 {
+                               nvidia,pins = "dmic3_clk_pe4";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dmic3_dat_pe5 {
+                               nvidia,pins = "dmic3_dat_pe5";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pe6 {
+                               nvidia,pins = "pe6";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pe7 {
+                               nvidia,pins = "pe7";
+                               nvidia,function = "pwm3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen3_i2c_scl_pf0 {
+                               nvidia,pins = "gen3_i2c_scl_pf0";
+                               nvidia,function = "i2c3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen3_i2c_sda_pf1 {
+                               nvidia,pins = "gen3_i2c_sda_pf1";
+                               nvidia,function = "i2c3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_tx_pg0 {
+                               nvidia,pins = "uart2_tx_pg0";
+                               nvidia,function = "uartb";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_rx_pg1 {
+                               nvidia,pins = "uart2_rx_pg1";
+                               nvidia,function = "uartb";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_rts_pg2 {
+                               nvidia,pins = "uart2_rts_pg2";
+                               nvidia,function = "uartb";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart2_cts_pg3 {
+                               nvidia,pins = "uart2_cts_pg3";
+                               nvidia,function = "uartb";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_en_ph0 {
+                               nvidia,pins = "wifi_en_ph0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_rst_ph1 {
+                               nvidia,pins = "wifi_rst_ph1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       wifi_wake_ap_ph2 {
+                               nvidia,pins = "wifi_wake_ap_ph2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_wake_bt_ph3 {
+                               nvidia,pins = "ap_wake_bt_ph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       bt_rst_ph4 {
+                               nvidia,pins = "bt_rst_ph4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       bt_wake_ap_ph5 {
+                               nvidia,pins = "bt_wake_ap_ph5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ph6 {
+                               nvidia,pins = "ph6";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_wake_nfc_ph7 {
+                               nvidia,pins = "ap_wake_nfc_ph7";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       nfc_en_pi0 {
+                               nvidia,pins = "nfc_en_pi0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       nfc_int_pi1 {
+                               nvidia,pins = "nfc_int_pi1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gps_en_pi2 {
+                               nvidia,pins = "gps_en_pi2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gps_rst_pi3 {
+                               nvidia,pins = "gps_rst_pi3";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_tx_pi4 {
+                               nvidia,pins = "uart4_tx_pi4";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_rx_pi5 {
+                               nvidia,pins = "uart4_rx_pi5";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_rts_pi6 {
+                               nvidia,pins = "uart4_rts_pi6";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart4_cts_pi7 {
+                               nvidia,pins = "uart4_cts_pi7";
+                               nvidia,function = "uartd";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen1_i2c_sda_pj0 {
+                               nvidia,pins = "gen1_i2c_sda_pj0";
+                               nvidia,function = "i2c1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen1_i2c_scl_pj1 {
+                               nvidia,pins = "gen1_i2c_scl_pj1";
+                               nvidia,function = "i2c1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       gen2_i2c_scl_pj2 {
+                               nvidia,pins = "gen2_i2c_scl_pj2";
+                               nvidia,function = "i2c2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       gen2_i2c_sda_pj3 {
+                               nvidia,pins = "gen2_i2c_sda_pj3";
+                               nvidia,function = "i2c2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       dap4_fs_pj4 {
+                               nvidia,pins = "dap4_fs_pj4";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_din_pj5 {
+                               nvidia,pins = "dap4_din_pj5";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_dout_pj6 {
+                               nvidia,pins = "dap4_dout_pj6";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap4_sclk_pj7 {
+                               nvidia,pins = "dap4_sclk_pj7";
+                               nvidia,function = "i2s4b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk0 {
+                               nvidia,pins = "pk0";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk1 {
+                               nvidia,pins = "pk1";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk2 {
+                               nvidia,pins = "pk2";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk3 {
+                               nvidia,pins = "pk3";
+                               nvidia,function = "i2s5b";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk4 {
+                               nvidia,pins = "pk4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk5 {
+                               nvidia,pins = "pk5";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk6 {
+                               nvidia,pins = "pk6";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pk7 {
+                               nvidia,pins = "pk7";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pl0 {
+                               nvidia,pins = "pl0";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pl1 {
+                               nvidia,pins = "pl1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_clk_pm0 {
+                               nvidia,pins = "sdmmc1_clk_pm0";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_cmd_pm1 {
+                               nvidia,pins = "sdmmc1_cmd_pm1";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat3_pm2 {
+                               nvidia,pins = "sdmmc1_dat3_pm2";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat2_pm3 {
+                               nvidia,pins = "sdmmc1_dat2_pm3";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat1_pm4 {
+                               nvidia,pins = "sdmmc1_dat1_pm4";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc1_dat0_pm5 {
+                               nvidia,pins = "sdmmc1_dat0_pm5";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_clk_pp0 {
+                               nvidia,pins = "sdmmc3_clk_pp0";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_cmd_pp1 {
+                               nvidia,pins = "sdmmc3_cmd_pp1";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat3_pp2 {
+                               nvidia,pins = "sdmmc3_dat3_pp2";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat2_pp3 {
+                               nvidia,pins = "sdmmc3_dat2_pp3";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat1_pp4 {
+                               nvidia,pins = "sdmmc3_dat1_pp4";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       sdmmc3_dat0_pp5 {
+                               nvidia,pins = "sdmmc3_dat0_pp5";
+                               nvidia,function = "sdmmc3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_mclk_ps0 {
+                               nvidia,pins = "cam1_mclk_ps0";
+                               nvidia,function = "extperiph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam2_mclk_ps1 {
+                               nvidia,pins = "cam2_mclk_ps1";
+                               nvidia,function = "extperiph3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_i2c_scl_ps2 {
+                               nvidia,pins = "cam_i2c_scl_ps2";
+                               nvidia,function = "i2cvi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_i2c_sda_ps3 {
+                               nvidia,pins = "cam_i2c_sda_ps3";
+                               nvidia,function = "i2cvi";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_rst_ps4 {
+                               nvidia,pins = "cam_rst_ps4";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_af_en_ps5 {
+                               nvidia,pins = "cam_af_en_ps5";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam_flash_en_ps6 {
+                               nvidia,pins = "cam_flash_en_ps6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_pwdn_ps7 {
+                               nvidia,pins = "cam1_pwdn_ps7";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam2_pwdn_pt0 {
+                               nvidia,pins = "cam2_pwdn_pt0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cam1_strobe_pt1 {
+                               nvidia,pins = "cam1_strobe_pt1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_tx_pu0 {
+                               nvidia,pins = "uart1_tx_pu0";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_rx_pu1 {
+                               nvidia,pins = "uart1_rx_pu1";
+                               nvidia,function = "uarta";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_rts_pu2 {
+                               nvidia,pins = "uart1_rts_pu2";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       uart1_cts_pu3 {
+                               nvidia,pins = "uart1_cts_pu3";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_bl_pwm_pv0 {
+                               nvidia,pins = "lcd_bl_pwm_pv0";
+                               nvidia,function = "pwm0";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_bl_en_pv1 {
+                               nvidia,pins = "lcd_bl_en_pv1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_rst_pv2 {
+                               nvidia,pins = "lcd_rst_pv2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_gpio1_pv3 {
+                               nvidia,pins = "lcd_gpio1_pv3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_gpio2_pv4 {
+                               nvidia,pins = "lcd_gpio2_pv4";
+                               nvidia,function = "pwm1";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       ap_ready_pv5 {
+                               nvidia,pins = "ap_ready_pv5";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_rst_pv6 {
+                               nvidia,pins = "touch_rst_pv6";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_clk_pv7 {
+                               nvidia,pins = "touch_clk_pv7";
+                               nvidia,function = "touch";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       modem_wake_ap_px0 {
+                               nvidia,pins = "modem_wake_ap_px0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       touch_int_px1 {
+                               nvidia,pins = "touch_int_px1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       motion_int_px2 {
+                               nvidia,pins = "motion_int_px2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       als_prox_int_px3 {
+                               nvidia,pins = "als_prox_int_px3";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       temp_alert_px4 {
+                               nvidia,pins = "temp_alert_px4";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_power_on_px5 {
+                               nvidia,pins = "button_power_on_px5";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_vol_up_px6 {
+                               nvidia,pins = "button_vol_up_px6";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_vol_down_px7 {
+                               nvidia,pins = "button_vol_down_px7";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_slide_sw_py0 {
+                               nvidia,pins = "button_slide_sw_py0";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       button_home_py1 {
+                               nvidia,pins = "button_home_py1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       lcd_te_py2 {
+                               nvidia,pins = "lcd_te_py2";
+                               nvidia,function = "displaya";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_i2c_scl_py3 {
+                               nvidia,pins = "pwr_i2c_scl_py3";
+                               nvidia,function = "i2cpmu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_i2c_sda_py4 {
+                               nvidia,pins = "pwr_i2c_sda_py4";
+                               nvidia,function = "i2cpmu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_32k_out_py5 {
+                               nvidia,pins = "clk_32k_out_py5";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz0 {
+                               nvidia,pins = "pz0";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz1 {
+                               nvidia,pins = "pz1";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz2 {
+                               nvidia,pins = "pz2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz3 {
+                               nvidia,pins = "pz3";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz4 {
+                               nvidia,pins = "pz4";
+                               nvidia,function = "sdmmc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pz5 {
+                               nvidia,pins = "pz5";
+                               nvidia,function = "soc";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_fs_paa0 {
+                               nvidia,pins = "dap2_fs_paa0";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_sclk_paa1 {
+                               nvidia,pins = "dap2_sclk_paa1";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_din_paa2 {
+                               nvidia,pins = "dap2_din_paa2";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dap2_dout_paa3 {
+                               nvidia,pins = "dap2_dout_paa3";
+                               nvidia,function = "i2s2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       aud_mclk_pbb0 {
+                               nvidia,pins = "aud_mclk_pbb0";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dvfs_pwm_pbb1 {
+                               nvidia,pins = "dvfs_pwm_pbb1";
+                               nvidia,function = "cldvfs";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       dvfs_clk_pbb2 {
+                               nvidia,pins = "dvfs_clk_pbb2";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gpio_x1_aud_pbb3 {
+                               nvidia,pins = "gpio_x1_aud_pbb3";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       gpio_x3_aud_pbb4 {
+                               nvidia,pins = "gpio_x3_aud_pbb4";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       hdmi_cec_pcc0 {
+                               nvidia,pins = "hdmi_cec_pcc0";
+                               nvidia,function = "cec";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       hdmi_int_dp_hpd_pcc1 {
+                               nvidia,pins = "hdmi_int_dp_hpd_pcc1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       spdif_out_pcc2 {
+                               nvidia,pins = "spdif_out_pcc2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       spdif_in_pcc3 {
+                               nvidia,pins = "spdif_in_pcc3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       usb_vbus_en0_pcc4 {
+                               nvidia,pins = "usb_vbus_en0_pcc4";
+                               nvidia,function = "usb";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       usb_vbus_en1_pcc5 {
+                               nvidia,pins = "usb_vbus_en1_pcc5";
+                               nvidia,function = "usb";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_ENABLE>;
+                       };
+                       dp_hpd0_pcc6 {
+                               nvidia,pins = "dp_hpd0_pcc6";
+                               nvidia,function = "dp";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pcc7 {
+                               nvidia,pins = "pcc7";
+                               nvidia,function = "rsvd0";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                               nvidia,io-hv = <TEGRA_PIN_DISABLE>;
+                       };
+                       spi2_cs1_pdd0 {
+                               nvidia,pins = "spi2_cs1_pdd0";
+                               nvidia,function = "spi2";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_sck_pee0 {
+                               nvidia,pins = "qspi_sck_pee0";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_cs_n_pee1 {
+                               nvidia,pins = "qspi_cs_n_pee1";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io0_pee2 {
+                               nvidia,pins = "qspi_io0_pee2";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io1_pee3 {
+                               nvidia,pins = "qspi_io1_pee3";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io2_pee4 {
+                               nvidia,pins = "qspi_io2_pee4";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       qspi_io3_pee5 {
+                               nvidia,pins = "qspi_io3_pee5";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       core_pwr_req {
+                               nvidia,pins = "core_pwr_req";
+                               nvidia,function = "core";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       cpu_pwr_req {
+                               nvidia,pins = "cpu_pwr_req";
+                               nvidia,function = "cpu";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       pwr_int_n {
+                               nvidia,pins = "pwr_int_n";
+                               nvidia,function = "pmi";
+                               nvidia,pull = <TEGRA_PIN_PULL_UP>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_32k_in {
+                               nvidia,pins = "clk_32k_in";
+                               nvidia,function = "clk";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       jtag_rtck {
+                               nvidia,pins = "jtag_rtck";
+                               nvidia,function = "jtag";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       clk_req {
+                               nvidia,pins = "clk_req";
+                               nvidia,function = "rsvd1";
+                               nvidia,pull = <TEGRA_PIN_PULL_DOWN>;
+                               nvidia,tristate = <TEGRA_PIN_ENABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+                       shutdown {
+                               nvidia,pins = "shutdown";
+                               nvidia,function = "shutdown";
+                               nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+                               nvidia,tristate = <TEGRA_PIN_DISABLE>;
+                               nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+                               nvidia,open-drain = <TEGRA_PIN_DISABLE>;
+                       };
+               };
+       };
+
+       /* MMC/SD */
+       sdhci@0,700b0000 {
+               status = "okay";
+               bus-width = <4>;
+               no-1-8-v;
+
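+               /* PZ1 is muxed as sdmmc1 with a pull-up in the pinmux above. */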
+               cd-gpios = <&gpio TEGRA_GPIO(Z, 1) GPIO_ACTIVE_LOW>;
+       };
+};
diff --git a/arch/arm64/boot/dts/nvidia/tegra210.dtsi b/arch/arm64/boot/dts/nvidia/tegra210.dtsi
new file mode 100644 (file)
index 0000000..bc23f4d
--- /dev/null
@@ -0,0 +1,805 @@
+#include <dt-bindings/clock/tegra210-car.h>
+#include <dt-bindings/gpio/tegra-gpio.h>
+#include <dt-bindings/memory/tegra210-mc.h>
+#include <dt-bindings/pinctrl/pinctrl-tegra.h>
+#include <dt-bindings/interrupt-controller/arm-gic.h>
+
+/ {
+       compatible = "nvidia,tegra210";
+       interrupt-parent = <&lic>;
+       #address-cells = <2>;
+       #size-cells = <2>;
+
+       host1x@0,50000000 {
+               compatible = "nvidia,tegra210-host1x", "simple-bus";
+               reg = <0x0 0x50000000 0x0 0x00034000>;
+               interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, /* syncpt */
+                            <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; /* general */
+               clocks = <&tegra_car TEGRA210_CLK_HOST1X>;
+               clock-names = "host1x";
+               resets = <&tegra_car 28>;
+               reset-names = "host1x";
+
+               #address-cells = <2>;
+               #size-cells = <2>;
+
+               ranges = <0x0 0x54000000 0x0 0x54000000 0x0 0x01000000>;
+
+               dpaux1: dpaux@0,54040000 {
+                       compatible = "nvidia,tegra210-dpaux";
+                       reg = <0x0 0x54040000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA210_CLK_DPAUX1>,
+                                <&tegra_car TEGRA210_CLK_PLL_DP>;
+                       clock-names = "dpaux", "parent";
+                       resets = <&tegra_car 207>;
+                       reset-names = "dpaux";
+                       status = "disabled";
+               };
+
+               vi@0,54080000 {
+                       compatible = "nvidia,tegra210-vi";
+                       reg = <0x0 0x54080000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
+                       status = "disabled";
+               };
+
+               tsec@0,54100000 {
+                       compatible = "nvidia,tegra210-tsec";
+                       reg = <0x0 0x54100000 0x0 0x00040000>;
+               };
+
+               dc@0,54200000 {
+                       compatible = "nvidia,tegra210-dc";
+                       reg = <0x0 0x54200000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA210_CLK_DISP1>,
+                                <&tegra_car TEGRA210_CLK_PLL_P>;
+                       clock-names = "dc", "parent";
+                       resets = <&tegra_car 27>;
+                       reset-names = "dc";
+
+                       iommus = <&mc TEGRA_SWGROUP_DC>;
+
+                       nvidia,head = <0>;
+               };
+
+               dc@0,54240000 {
+                       compatible = "nvidia,tegra210-dc";
+                       reg = <0x0 0x54240000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA210_CLK_DISP2>,
+                                <&tegra_car TEGRA210_CLK_PLL_P>;
+                       clock-names = "dc", "parent";
+                       resets = <&tegra_car 26>;
+                       reset-names = "dc";
+
+                       iommus = <&mc TEGRA_SWGROUP_DCB>;
+
+                       nvidia,head = <1>;
+               };
+
+               dsi@0,54300000 {
+                       compatible = "nvidia,tegra210-dsi";
+                       reg = <0x0 0x54300000 0x0 0x00040000>;
+                       clocks = <&tegra_car TEGRA210_CLK_DSIA>,
+                                <&tegra_car TEGRA210_CLK_DSIALP>,
+                                <&tegra_car TEGRA210_CLK_PLL_D_OUT0>;
+                       clock-names = "dsi", "lp", "parent";
+                       resets = <&tegra_car 48>;
+                       reset-names = "dsi";
+                       nvidia,mipi-calibrate = <&mipi 0x0c0>; /* DSIA & DSIB pads */
+
+                       status = "disabled";
+
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+               };
+
+               vic@0,54340000 {
+                       compatible = "nvidia,tegra210-vic";
+                       reg = <0x0 0x54340000 0x0 0x00040000>;
+                       status = "disabled";
+               };
+
+               nvjpg@0,54380000 {
+                       compatible = "nvidia,tegra210-nvjpg";
+                       reg = <0x0 0x54380000 0x0 0x00040000>;
+                       status = "disabled";
+               };
+
+               dsi@0,54400000 {
+                       compatible = "nvidia,tegra210-dsi";
+                       reg = <0x0 0x54400000 0x0 0x00040000>;
+                       clocks = <&tegra_car TEGRA210_CLK_DSIB>,
+                                <&tegra_car TEGRA210_CLK_DSIBLP>,
+                                <&tegra_car TEGRA210_CLK_PLL_D_OUT0>;
+                       clock-names = "dsi", "lp", "parent";
+                       resets = <&tegra_car 82>;
+                       reset-names = "dsi";
+                       nvidia,mipi-calibrate = <&mipi 0x300>; /* DSIC & DSID pads */
+
+                       status = "disabled";
+
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+               };
+
+               nvdec@0,54480000 {
+                       compatible = "nvidia,tegra210-nvdec";
+                       reg = <0x0 0x54480000 0x0 0x00040000>;
+                       status = "disabled";
+               };
+
+               nvenc@0,544c0000 {
+                       compatible = "nvidia,tegra210-nvenc";
+                       reg = <0x0 0x544c0000 0x0 0x00040000>;
+                       status = "disabled";
+               };
+
+               tsec@0,54500000 {
+                       compatible = "nvidia,tegra210-tsec";
+                       reg = <0x0 0x54500000 0x0 0x00040000>;
+                       status = "disabled";
+               };
+
+               sor@0,54540000 {
+                       compatible = "nvidia,tegra210-sor";
+                       reg = <0x0 0x54540000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA210_CLK_SOR0>,
+                                <&tegra_car TEGRA210_CLK_PLL_D_OUT0>,
+                                <&tegra_car TEGRA210_CLK_PLL_DP>,
+                                <&tegra_car TEGRA210_CLK_SOR_SAFE>;
+                       clock-names = "sor", "parent", "dp", "safe";
+                       resets = <&tegra_car 182>;
+                       reset-names = "sor";
+                       status = "disabled";
+               };
+
+               sor@0,54580000 {
+                       compatible = "nvidia,tegra210-sor1";
+                       reg = <0x0 0x54580000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA210_CLK_SOR1>,
+                                <&tegra_car TEGRA210_CLK_PLL_D2_OUT0>,
+                                <&tegra_car TEGRA210_CLK_PLL_DP>,
+                                <&tegra_car TEGRA210_CLK_SOR_SAFE>;
+                       clock-names = "sor", "parent", "dp", "safe";
+                       resets = <&tegra_car 183>;
+                       reset-names = "sor";
+                       status = "disabled";
+               };
+
+               dpaux: dpaux@0,545c0000 {
+                       compatible = "nvidia,tegra124-dpaux";
+                       reg = <0x0 0x545c0000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>;
+                       clocks = <&tegra_car TEGRA210_CLK_DPAUX>,
+                                <&tegra_car TEGRA210_CLK_PLL_DP>;
+                       clock-names = "dpaux", "parent";
+                       resets = <&tegra_car 181>;
+                       reset-names = "dpaux";
+                       status = "disabled";
+               };
+
+               isp@0,54600000 {
+                       compatible = "nvidia,tegra210-isp";
+                       reg = <0x0 0x54600000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>;
+                       status = "disabled";
+               };
+
+               isp@0,54680000 {
+                       compatible = "nvidia,tegra210-isp";
+                       reg = <0x0 0x54680000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 70 IRQ_TYPE_LEVEL_HIGH>;
+                       status = "disabled";
+               };
+
+               i2c@0,546c0000 {
+                       compatible = "nvidia,tegra210-i2c-vi";
+                       reg = <0x0 0x546c0000 0x0 0x00040000>;
+                       interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>;
+                       status = "disabled";
+               };
+       };
+
+       gic: interrupt-controller@0,50041000 {
+               compatible = "arm,gic-400";
+               #interrupt-cells = <3>;
+               interrupt-controller;
+               reg = <0x0 0x50041000 0x0 0x1000>,
+                     <0x0 0x50042000 0x0 0x2000>,
+                     <0x0 0x50044000 0x0 0x2000>,
+                     <0x0 0x50046000 0x0 0x2000>;
+               interrupts = <GIC_PPI 9
+                       (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>;
+               interrupt-parent = <&gic>;
+       };
+
+       gpu@0,57000000 {
+               compatible = "nvidia,gm20b";
+               reg = <0x0 0x57000000 0x0 0x01000000>,
+                     <0x0 0x58000000 0x0 0x01000000>;
+               interrupts = <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>;
+               interrupt-names = "stall", "nonstall";
+               clocks = <&tegra_car TEGRA210_CLK_GPU>,
+                        <&tegra_car TEGRA210_CLK_PLL_P_OUT5>;
+               clock-names = "gpu", "pwr";
+               resets = <&tegra_car 184>;
+               reset-names = "gpu";
+               status = "disabled";
+       };
+
+       lic: interrupt-controller@0,60004000 {
+               compatible = "nvidia,tegra210-ictlr";
+               reg = <0x0 0x60004000 0x0 0x40>, /* primary controller */
+                     <0x0 0x60004100 0x0 0x40>, /* secondary controller */
+                     <0x0 0x60004200 0x0 0x40>, /* tertiary controller */
+                     <0x0 0x60004300 0x0 0x40>, /* quaternary controller */
+                     <0x0 0x60004400 0x0 0x40>, /* quinary controller */
+                     <0x0 0x60004500 0x0 0x40>; /* senary controller */
+               interrupt-controller;
+               #interrupt-cells = <3>;
+               interrupt-parent = <&gic>;
+       };
+
+       timer@0,60005000 {
+               compatible = "nvidia,tegra210-timer", "nvidia,tegra20-timer";
+               reg = <0x0 0x60005000 0x0 0x400>;
+               interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_TIMER>;
+               clock-names = "timer";
+       };
+
+       tegra_car: clock@0,60006000 {
+               compatible = "nvidia,tegra210-car";
+               reg = <0x0 0x60006000 0x0 0x1000>;
+               #clock-cells = <1>;
+               #reset-cells = <1>;
+       };
+
+       flow-controller@0,60007000 {
+               compatible = "nvidia,tegra210-flowctrl";
+               reg = <0x0 0x60007000 0x0 0x1000>;
+       };
+
+       gpio: gpio@0,6000d000 {
+               compatible = "nvidia,tegra210-gpio", "nvidia,tegra124-gpio", "nvidia,tegra30-gpio";
+               reg = <0x0 0x6000d000 0x0 0x1000>;
+               interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>;
+               #gpio-cells = <2>;
+               gpio-controller;
+               #interrupt-cells = <2>;
+               interrupt-controller;
+       };
+
+       apbdma: dma@0,60020000 {
+               compatible = "nvidia,tegra210-apbdma", "nvidia,tegra148-apbdma";
+               reg = <0x0 0x60020000 0x0 0x1400>;
+               interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 130 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 132 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 133 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 135 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_APBDMA>;
+               clock-names = "dma";
+               resets = <&tegra_car 34>;
+               reset-names = "dma";
+               #dma-cells = <1>;
+       };
+
+       apbmisc@0,70000800 {
+               compatible = "nvidia,tegra210-apbmisc", "nvidia,tegra20-apbmisc";
+               reg = <0x0 0x70000800 0x0 0x64>,   /* Chip revision */
+                     <0x0 0x7000e864 0x0 0x04>;   /* Strapping options */
+       };
+
+       pinmux: pinmux@0,700008d4 {
+               compatible = "nvidia,tegra210-pinmux";
+               reg = <0x0 0x700008d4 0x0 0x29c>, /* Pad control registers */
+                     <0x0 0x70003000 0x0 0x294>; /* Mux registers */
+       };
+
+       /*
+        * There are two serial drivers: the simple 8250-based serial
+        * driver and the APB DMA based serial driver for higher baud
+        * rates and better performance. To enable the 8250-based driver,
+        * use the compatible "nvidia,tegra210-uart", "nvidia,tegra20-uart";
+        * to enable the APB DMA based serial driver, use the compatible
+        * "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart".
+        */
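+       /*
+        * For example, a board-level .dts could switch UARTA to the
+        * DMA-based driver by overriding the node via its label
+        * (a sketch; any board would also enable the node):
+        *
+        *      &uarta {
+        *              compatible = "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart";
+        *              status = "okay";
+        *      };
+        */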
+       uarta: serial@0,70006000 {
+               compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006000 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_UARTA>;
+               clock-names = "serial";
+               resets = <&tegra_car 6>;
+               reset-names = "serial";
+               dmas = <&apbdma 8>, <&apbdma 8>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       uartb: serial@0,70006040 {
+               compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006040 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_UARTB>;
+               clock-names = "serial";
+               resets = <&tegra_car 7>;
+               reset-names = "serial";
+               dmas = <&apbdma 9>, <&apbdma 9>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       uartc: serial@0,70006200 {
+               compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006200 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_UARTC>;
+               clock-names = "serial";
+               resets = <&tegra_car 55>;
+               reset-names = "serial";
+               dmas = <&apbdma 10>, <&apbdma 10>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       uartd: serial@0,70006300 {
+               compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart";
+               reg = <0x0 0x70006300 0x0 0x40>;
+               reg-shift = <2>;
+               interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_UARTD>;
+               clock-names = "serial";
+               resets = <&tegra_car 65>;
+               reset-names = "serial";
+               dmas = <&apbdma 19>, <&apbdma 19>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       pwm: pwm@0,7000a000 {
+               compatible = "nvidia,tegra210-pwm", "nvidia,tegra20-pwm";
+               reg = <0x0 0x7000a000 0x0 0x100>;
+               #pwm-cells = <2>;
+               clocks = <&tegra_car TEGRA210_CLK_PWM>;
+               clock-names = "pwm";
+               resets = <&tegra_car 17>;
+               reset-names = "pwm";
+               status = "disabled";
+       };
+
+       i2c@0,7000c000 {
+               compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c000 0x0 0x100>;
+               interrupts = <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_I2C1>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 12>;
+               reset-names = "i2c";
+               dmas = <&apbdma 21>, <&apbdma 21>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000c400 {
+               compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c400 0x0 0x100>;
+               interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_I2C2>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 54>;
+               reset-names = "i2c";
+               dmas = <&apbdma 22>, <&apbdma 22>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000c500 {
+               compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c500 0x0 0x100>;
+               interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_I2C3>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 67>;
+               reset-names = "i2c";
+               dmas = <&apbdma 23>, <&apbdma 23>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000c700 {
+               compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000c700 0x0 0x100>;
+               interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_I2C4>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 103>;
+               reset-names = "i2c";
+               dmas = <&apbdma 26>, <&apbdma 26>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000d000 {
+               compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000d000 0x0 0x100>;
+               interrupts = <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_I2C5>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 47>;
+               reset-names = "i2c";
+               dmas = <&apbdma 24>, <&apbdma 24>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       i2c@0,7000d100 {
+               compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c";
+               reg = <0x0 0x7000d100 0x0 0x100>;
+               interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_I2C6>;
+               clock-names = "div-clk";
+               resets = <&tegra_car 166>;
+               reset-names = "i2c";
+               dmas = <&apbdma 30>, <&apbdma 30>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000d400 {
+               compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000d400 0x0 0x200>;
+               interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_SBC1>;
+               clock-names = "spi";
+               resets = <&tegra_car 41>;
+               reset-names = "spi";
+               dmas = <&apbdma 15>, <&apbdma 15>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000d600 {
+               compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000d600 0x0 0x200>;
+               interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_SBC2>;
+               clock-names = "spi";
+               resets = <&tegra_car 44>;
+               reset-names = "spi";
+               dmas = <&apbdma 16>, <&apbdma 16>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000d800 {
+               compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000d800 0x0 0x200>;
+               interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_SBC3>;
+               clock-names = "spi";
+               resets = <&tegra_car 46>;
+               reset-names = "spi";
+               dmas = <&apbdma 17>, <&apbdma 17>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       spi@0,7000da00 {
+               compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi";
+               reg = <0x0 0x7000da00 0x0 0x200>;
+               interrupts = <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_SBC4>;
+               clock-names = "spi";
+               resets = <&tegra_car 68>;
+               reset-names = "spi";
+               dmas = <&apbdma 18>, <&apbdma 18>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       rtc@0,7000e000 {
+               compatible = "nvidia,tegra210-rtc", "nvidia,tegra20-rtc";
+               reg = <0x0 0x7000e000 0x0 0x100>;
+               interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_RTC>;
+               clock-names = "rtc";
+       };
+
+       pmc: pmc@0,7000e400 {
+               compatible = "nvidia,tegra210-pmc";
+               reg = <0x0 0x7000e400 0x0 0x400>;
+               clocks = <&tegra_car TEGRA210_CLK_PCLK>, <&clk32k_in>;
+               clock-names = "pclk", "clk32k_in";
+
+               #power-domain-cells = <1>;
+       };
+
+       fuse@0,7000f800 {
+               compatible = "nvidia,tegra210-efuse";
+               reg = <0x0 0x7000f800 0x0 0x400>;
+               clocks = <&tegra_car TEGRA210_CLK_FUSE>;
+               clock-names = "fuse";
+               resets = <&tegra_car 39>;
+               reset-names = "fuse";
+       };
+
+       mc: memory-controller@0,70019000 {
+               compatible = "nvidia,tegra210-mc";
+               reg = <0x0 0x70019000 0x0 0x1000>;
+               clocks = <&tegra_car TEGRA210_CLK_MC>;
+               clock-names = "mc";
+
+               interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>;
+
+               #iommu-cells = <1>;
+       };
+
+       hda@0,70030000 {
+               compatible = "nvidia,tegra210-hda", "nvidia,tegra30-hda";
+               reg = <0x0 0x70030000 0x0 0x10000>;
+               interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_HDA>,
+                        <&tegra_car TEGRA210_CLK_HDA2HDMI>,
+                        <&tegra_car TEGRA210_CLK_HDA2CODEC_2X>;
+               clock-names = "hda", "hda2hdmi", "hda2codec_2x";
+               resets = <&tegra_car 125>, /* hda */
+                        <&tegra_car 128>, /* hda2hdmi */
+                        <&tegra_car 111>; /* hda2codec_2x */
+               reset-names = "hda", "hda2hdmi", "hda2codec_2x";
+               status = "disabled";
+       };
+
+       sdhci@0,700b0000 {
+               compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0000 0x0 0x200>;
+               interrupts = <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_SDMMC1>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 14>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       sdhci@0,700b0200 {
+               compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0200 0x0 0x200>;
+               interrupts = <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_SDMMC2>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 9>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       sdhci@0,700b0400 {
+               compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0400 0x0 0x200>;
+               interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_SDMMC3>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 69>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       sdhci@0,700b0600 {
+               compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci";
+               reg = <0x0 0x700b0600 0x0 0x200>;
+               interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&tegra_car TEGRA210_CLK_SDMMC4>;
+               clock-names = "sdhci";
+               resets = <&tegra_car 15>;
+               reset-names = "sdhci";
+               status = "disabled";
+       };
+
+       mipi: mipi@0,700e3000 {
+               compatible = "nvidia,tegra210-mipi";
+               reg = <0x0 0x700e3000 0x0 0x100>;
+               clocks = <&tegra_car TEGRA210_CLK_MIPI_CAL>;
+               clock-names = "mipi-cal";
+               #nvidia,mipi-calibrate-cells = <1>;
+       };
+
+       spi@0,70410000 {
+               compatible = "nvidia,tegra210-qspi";
+               reg = <0x0 0x70410000 0x0 0x1000>;
+               interrupts = <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>;
+               #address-cells = <1>;
+               #size-cells = <0>;
+               clocks = <&tegra_car TEGRA210_CLK_QSPI>;
+               clock-names = "qspi";
+               resets = <&tegra_car 211>;
+               reset-names = "qspi";
+               dmas = <&apbdma 5>, <&apbdma 5>;
+               dma-names = "rx", "tx";
+               status = "disabled";
+       };
+
+       usb@0,7d000000 {
+               compatible = "nvidia,tegra210-ehci", "nvidia,tegra30-ehci", "usb-ehci";
+               reg = <0x0 0x7d000000 0x0 0x4000>;
+               interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA210_CLK_USBD>;
+               clock-names = "usb";
+               resets = <&tegra_car 22>;
+               reset-names = "usb";
+               nvidia,phy = <&phy1>;
+               status = "disabled";
+       };
+
+       phy1: usb-phy@0,7d000000 {
+               compatible = "nvidia,tegra210-usb-phy", "nvidia,tegra30-usb-phy";
+               reg = <0x0 0x7d000000 0x0 0x4000>,
+                     <0x0 0x7d000000 0x0 0x4000>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA210_CLK_USBD>,
+                        <&tegra_car TEGRA210_CLK_PLL_U>,
+                        <&tegra_car TEGRA210_CLK_USBD>;
+               clock-names = "reg", "pll_u", "utmi-pads";
+               resets = <&tegra_car 22>, <&tegra_car 22>;
+               reset-names = "usb", "utmi-pads";
+               nvidia,hssync-start-delay = <0>;
+               nvidia,idle-wait-delay = <17>;
+               nvidia,elastic-limit = <16>;
+               nvidia,term-range-adj = <6>;
+               nvidia,xcvr-setup = <9>;
+               nvidia,xcvr-lsfslew = <0>;
+               nvidia,xcvr-lsrslew = <3>;
+               nvidia,hssquelch-level = <2>;
+               nvidia,hsdiscon-level = <5>;
+               nvidia,xcvr-hsslew = <12>;
+               nvidia,has-utmi-pad-registers;
+               status = "disabled";
+       };
+
+       usb@0,7d004000 {
+               compatible = "nvidia,tegra210-ehci", "nvidia,tegra30-ehci", "usb-ehci";
+               reg = <0x0 0x7d004000 0x0 0x4000>;
+               interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA210_CLK_USB2>;
+               clock-names = "usb";
+               resets = <&tegra_car 58>;
+               reset-names = "usb";
+               nvidia,phy = <&phy2>;
+               status = "disabled";
+       };
+
+       phy2: usb-phy@0,7d004000 {
+               compatible = "nvidia,tegra210-usb-phy", "nvidia,tegra30-usb-phy";
+               reg = <0x0 0x7d004000 0x0 0x4000>,
+                     <0x0 0x7d000000 0x0 0x4000>;
+               phy_type = "utmi";
+               clocks = <&tegra_car TEGRA210_CLK_USB2>,
+                        <&tegra_car TEGRA210_CLK_PLL_U>,
+                        <&tegra_car TEGRA210_CLK_USBD>;
+               clock-names = "reg", "pll_u", "utmi-pads";
+               resets = <&tegra_car 58>, <&tegra_car 22>;
+               reset-names = "usb", "utmi-pads";
+               nvidia,hssync-start-delay = <0>;
+               nvidia,idle-wait-delay = <17>;
+               nvidia,elastic-limit = <16>;
+               nvidia,term-range-adj = <6>;
+               nvidia,xcvr-setup = <9>;
+               nvidia,xcvr-lsfslew = <0>;
+               nvidia,xcvr-lsrslew = <3>;
+               nvidia,hssquelch-level = <2>;
+               nvidia,hsdiscon-level = <5>;
+               nvidia,xcvr-hsslew = <12>;
+               status = "disabled";
+       };
+
+       cpus {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               cpu@0 {
+                       device_type = "cpu";
+                       compatible = "arm,cortex-a57";
+                       reg = <0>;
+               };
+
+               cpu@1 {
+                       device_type = "cpu";
+                       compatible = "arm,cortex-a57";
+                       reg = <1>;
+               };
+
+               cpu@2 {
+                       device_type = "cpu";
+                       compatible = "arm,cortex-a57";
+                       reg = <2>;
+               };
+
+               cpu@3 {
+                       device_type = "cpu";
+                       compatible = "arm,cortex-a57";
+                       reg = <3>;
+               };
+       };
+
+       timer {
+               compatible = "arm,armv8-timer";
+               interrupts = <GIC_PPI 13
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>,
+                            <GIC_PPI 14
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>,
+                            <GIC_PPI 11
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>,
+                            <GIC_PPI 10
+                               (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>;
+               interrupt-parent = <&gic>;
+       };
+};
index 74c132d901bd290bcc803b7c0a0266428a2ec1cb..6a8685051b676196780cab79a495fc6d906ccce7 100644 (file)
@@ -11,7 +11,7 @@
 
 
 
-#define NR_syscalls                    323 /* length of syscall table */
+#define NR_syscalls                    324 /* length of syscall table */
 
 /*
  * The following defines stop scripts/checksyscalls.sh from complaining about
index 762edce7572eb33f03a3402e4357de69626764c9..41369a103838a6e3d8776f550885399d9006e60e 100644 (file)
 #define __NR_membarrier                        1344
 #define __NR_kcmp                      1345
 #define __NR_mlock2                    1346
+#define __NR_copy_file_range           1347
 
 #endif /* _UAPI_ASM_IA64_UNISTD_H */
index 534a74acb849d736726b5f066675f0a3a4c87ed4..477c55e244ecd45ce879dea27779967a16e0d13c 100644 (file)
@@ -1772,5 +1772,6 @@ sys_call_table:
        data8 sys_membarrier
        data8 sys_kcmp                          // 1345
        data8 sys_mlock2
+       data8 sys_copy_file_range
 
        .org sys_call_table + 8*NR_syscalls     // guard against failures to increase NR_syscalls
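
Wiring a new syscall on ia64 touches three places that must stay in sync: the NR_syscalls count, the __NR_ constant, and the sys_call_table slot; the .org directive at the end of the table turns a missed NR_syscalls bump into a build failure. A quick userspace smoke test, sketched on the assumption that the libc headers do not yet define the new number:

	/* Hypothetical check: any errno other than ENOSYS means the
	 * table entry is wired up. */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_copy_file_range
	#define __NR_copy_file_range 1347	/* ia64 number from this series */
	#endif

	int main(void)
	{
		long ret = syscall(__NR_copy_file_range, 0, NULL, 1, NULL, 0, 0);
		printf("copy_file_range: %ld\n", ret);
		return 0;
	}
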
index 5038fd578e65acaeeb15127d3af1aa7a75068635..2936a0044c049681d6eeeb80eddbb3ab4ad9295a 100644 (file)
@@ -1799,9 +1799,9 @@ static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int data
        struct inode *inode = file_inode(file);
        int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (!err) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                err = spufs_mfc_flush(file, NULL);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        return err;
 }
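
This and the conversions that follow replace open-coded i_mutex locking with the inode_lock() family of helpers new in this cycle. At this stage they are thin wrappers; roughly, as a sketch of the include/linux/fs.h additions:

	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

	static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
	{
		mutex_lock_nested(&inode->i_mutex, subclass);
	}

Keeping callers away from the field itself is what allows the lock implementation to be changed later in one place instead of in every filesystem.
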
index ad4840f86be1f53a127ac829b62d2cc41c95ce49..dfa863876778144baa5cb99ac65e61c4ca60ec70 100644 (file)
@@ -163,7 +163,7 @@ static void spufs_prune_dir(struct dentry *dir)
 {
        struct dentry *dentry, *tmp;
 
-       mutex_lock(&d_inode(dir)->i_mutex);
+       inode_lock(d_inode(dir));
        list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
                spin_lock(&dentry->d_lock);
                if (simple_positive(dentry)) {
@@ -180,7 +180,7 @@ static void spufs_prune_dir(struct dentry *dir)
                }
        }
        shrink_dcache_parent(dir);
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 }
 
 /* Caller must hold parent->i_mutex */
@@ -225,9 +225,9 @@ static int spufs_dir_close(struct inode *inode, struct file *file)
        parent = d_inode(dir->d_parent);
        ctx = SPUFS_I(d_inode(dir))->i_ctx;
 
-       mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(parent, I_MUTEX_PARENT);
        ret = spufs_rmdir(parent, dir);
-       mutex_unlock(&parent->i_mutex);
+       inode_unlock(parent);
        WARN_ON(ret);
 
        return dcache_dir_close(inode, file);
@@ -270,7 +270,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        dget(dentry);
        inc_nlink(dir);
@@ -291,7 +291,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
        if (ret)
                spufs_rmdir(dir, dentry);
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return ret;
 }
index b2e5902bd8f4d8e5f4f53cc3e6fad1cb183db7be..0f3da2cb2bd63c0d26649776fd53d844f3153d18 100644 (file)
@@ -67,7 +67,7 @@ static void hypfs_remove(struct dentry *dentry)
        struct dentry *parent;
 
        parent = dentry->d_parent;
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        if (simple_positive(dentry)) {
                if (d_is_dir(dentry))
                        simple_rmdir(d_inode(parent), dentry);
@@ -76,7 +76,7 @@ static void hypfs_remove(struct dentry *dentry)
        }
        d_delete(dentry);
        dput(dentry);
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
 }
 
 static void hypfs_delete_tree(struct dentry *root)
@@ -331,7 +331,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name,
        struct dentry *dentry;
        struct inode *inode;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        dentry = lookup_one_len(name, parent, strlen(name));
        if (IS_ERR(dentry)) {
                dentry = ERR_PTR(-ENOMEM);
@@ -359,7 +359,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name,
        d_instantiate(dentry, inode);
        dget(dentry);
 fail:
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        return dentry;
 }
 
index 1544fabcd7f9b7428a5d3ec7429f00f03b545945..c57fd1ea968957cd600ea1032caf1a9f379b38e5 100644 (file)
@@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void)
 }
 
 /**
- * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * arch_wb_cache_pmem - write back a cache range with CLWB
  * @vaddr:     virtual start address
  * @size:      number of bytes to write back
  *
  * Write back a cache range using the CLWB (cache line write back)
  * instruction.  This function requires explicit ordering with an
- * arch_wmb_pmem() call.  This API is internal to the x86 PMEM implementation.
+ * arch_wmb_pmem() call.
  */
-static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
 {
        u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
        unsigned long clflush_mask = x86_clflush_size - 1;
+       void *vaddr = (void __force *)addr;
        void *vend = vaddr + size;
        void *p;
 
@@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
        len = copy_from_iter_nocache(vaddr, bytes, i);
 
        if (__iter_needs_pmem_wb(i))
-               __arch_wb_cache_pmem(vaddr, bytes);
+               arch_wb_cache_pmem(addr, bytes);
 
        return len;
 }
@@ -133,7 +134,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
        void *vaddr = (void __force *)addr;
 
        memset(vaddr, 0, size);
-       __arch_wb_cache_pmem(vaddr, size);
+       arch_wb_cache_pmem(addr, size);
 }
 
 static inline bool __arch_has_wmb_pmem(void)
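
Renaming __arch_wb_cache_pmem() to arch_wb_cache_pmem() and having it take a __pmem-annotated pointer promotes the helper from an implementation detail to part of the arch PMEM API, confining the __force cast to a single spot. The body following the shown prologue is the usual cache-line walk; a sketch, assuming it mirrors the clflush-style helpers in this file:

	/* Flush each cache line in [vaddr, vend); ordering against the
	 * preceding stores remains the caller's job via arch_wmb_pmem(). */
	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
	     p < vend; p += x86_clflush_size)
		clwb(p);
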
index db5f622c9d67e05a4e2a15778c922b367e9cd82b..9eda2322b2d40acfb308a8afa245de04c9b84989 100644 (file)
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-                       blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+                       blk-lib.o blk-mq.o blk-mq-tag.o \
                        blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
                        genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
                        badblocks.o partitions/
index f6325d573c10a42c673f844e4fb9585a0d9bd292..711e4d8de6fa06020432da0f37a9edcdd67a6151 100644 (file)
@@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
        }
 
        if (unlikely(!bip))
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        memset(bip, 0, sizeof(*bip));
 
@@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
        return bip;
 err:
        mempool_free(bip, bs->bio_integrity_pool);
-       return NULL;
+       return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
@@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio)
 
        /* Allocate bio integrity payload and integrity vectors */
        bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
-       if (unlikely(bip == NULL)) {
+       if (IS_ERR(bip)) {
                printk(KERN_ERR "could not allocate data integrity bioset\n");
                kfree(buf);
-               return -EIO;
+               return PTR_ERR(bip);
        }
 
        bip->bip_flags |= BIP_BLOCK_INTEGRITY;
@@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
        BUG_ON(bip_src == NULL);
 
        bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
-
-       if (bip == NULL)
-               return -EIO;
+       if (IS_ERR(bip))
+               return PTR_ERR(bip);
 
        memcpy(bip->bip_vec, bip_src->bip_vec,
               bip_src->bip_vcnt * sizeof(struct bio_vec));
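
Returning ERR_PTR(-ENOMEM) instead of NULL lets every caller propagate the real errno rather than inventing -EIO. The encoding is the standard include/linux/err.h one, roughly:

	/* Errno values live in the top page of the address space, so a
	 * single pointer can carry either a valid address or a small
	 * negative error code. */
	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline bool IS_ERR(const void *ptr)
	{
		return IS_ERR_VALUE((unsigned long)ptr);
	}
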
index 476244d5930917401dd6d41a314962a9edc6e6b3..ab51685988c253616f751615ba06206732149de4 100644 (file)
@@ -680,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
        wake_up_all(&q->mq_freeze_wq);
 }
 
+static void blk_rq_timed_out_timer(unsigned long data)
+{
+       struct request_queue *q = (struct request_queue *)data;
+
+       kblockd_schedule_work(&q->timeout_work);
+}
+
 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
        struct request_queue *q;
@@ -841,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
        if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
                goto fail;
 
+       INIT_WORK(&q->timeout_work, blk_timeout_work);
        q->request_fn           = rfn;
        q->prep_rq_fn           = NULL;
        q->unprep_rq_fn         = NULL;
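
blk_rq_timed_out_timer() is now only a trampoline that punts to kblockd, so the actual expiry scan runs in process context where it is allowed to sleep. The queue timer is pointed at it elsewhere in blk_alloc_queue_node() (not visible in this hunk), presumably along the lines of:

	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long)q);
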
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
deleted file mode 100644 (file)
index 0736729..0000000
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Functions related to interrupt-poll handling in the block layer. This
- * is similar to NAPI for network devices.
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/blk-iopoll.h>
-#include <linux/delay.h>
-
-#include "blk.h"
-
-static unsigned int blk_iopoll_budget __read_mostly = 256;
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
-
-/**
- * blk_iopoll_sched - Schedule a run of the iopoll handler
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     Add this blk_iopoll structure to the pending poll list and trigger the
- *     raise of the blk iopoll softirq. The driver must already have gotten a
- *     successful return from blk_iopoll_sched_prep() before calling this.
- **/
-void blk_iopoll_sched(struct blk_iopoll *iop)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
-       __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_iopoll_sched);
-
-/**
- * __blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     See blk_iopoll_complete(). This function must be called with interrupts
- *     disabled.
- **/
-void __blk_iopoll_complete(struct blk_iopoll *iop)
-{
-       list_del(&iop->list);
-       smp_mb__before_atomic();
-       clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(__blk_iopoll_complete);
-
-/**
- * blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     If a driver consumes less than the assigned budget in its run of the
- *     iopoll handler, it'll end the polled mode by calling this function. The
- *     iopoll handler will not be invoked again before blk_iopoll_sched_prep()
- *     is called.
- **/
-void blk_iopoll_complete(struct blk_iopoll *iop)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __blk_iopoll_complete(iop);
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_iopoll_complete);
-
-static void blk_iopoll_softirq(struct softirq_action *h)
-{
-       struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
-       int rearm = 0, budget = blk_iopoll_budget;
-       unsigned long start_time = jiffies;
-
-       local_irq_disable();
-
-       while (!list_empty(list)) {
-               struct blk_iopoll *iop;
-               int work, weight;
-
-               /*
-                * If softirq window is exhausted then punt.
-                */
-               if (budget <= 0 || time_after(jiffies, start_time)) {
-                       rearm = 1;
-                       break;
-               }
-
-               local_irq_enable();
-
-               /* Even though interrupts have been re-enabled, this
-                * access is safe because interrupts can only add new
-                * entries to the tail of this list, and only ->poll()
-                * calls can remove this head entry from the list.
-                */
-               iop = list_entry(list->next, struct blk_iopoll, list);
-
-               weight = iop->weight;
-               work = 0;
-               if (test_bit(IOPOLL_F_SCHED, &iop->state))
-                       work = iop->poll(iop, weight);
-
-               budget -= work;
-
-               local_irq_disable();
-
-               /*
-                * Drivers must not modify the iopoll state if they
-                * consume their assigned weight (or more; some drivers can't
-                * easily just stop processing, they have to complete an
-                * entire mask of commands). In such cases this code
-                * still "owns" the iopoll instance and therefore can
-                * move the instance around on the list at-will.
-                */
-               if (work >= weight) {
-                       if (blk_iopoll_disable_pending(iop))
-                               __blk_iopoll_complete(iop);
-                       else
-                               list_move_tail(&iop->list, list);
-               }
-       }
-
-       if (rearm)
-               __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-
-       local_irq_enable();
-}
-
-/**
- * blk_iopoll_disable - Disable iopoll on this @iop
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     Disable io polling and wait for any pending callbacks to have completed.
- **/
-void blk_iopoll_disable(struct blk_iopoll *iop)
-{
-       set_bit(IOPOLL_F_DISABLE, &iop->state);
-       while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
-               msleep(1);
-       clear_bit(IOPOLL_F_DISABLE, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_disable);
-
-/**
- * blk_iopoll_enable - Enable iopoll on this @iop
- * @iop:      The parent iopoll structure
- *
- * Description:
- *     Enable iopoll on this @iop. Note that the handler run will not be
- *     scheduled; it only marks the instance as active.
- **/
-void blk_iopoll_enable(struct blk_iopoll *iop)
-{
-       BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
-       smp_mb__before_atomic();
-       clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_enable);
-
-/**
- * blk_iopoll_init - Initialize this @iop
- * @iop:      The parent iopoll structure
- * @weight:   The default weight (or command completion budget)
- * @poll_fn:  The handler to invoke
- *
- * Description:
- *     Initialize this blk_iopoll structure. Before being actively used, the
- *     driver must call blk_iopoll_enable().
- **/
-void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
-{
-       memset(iop, 0, sizeof(*iop));
-       INIT_LIST_HEAD(&iop->list);
-       iop->weight = weight;
-       iop->poll = poll_fn;
-       set_bit(IOPOLL_F_SCHED, &iop->state);
-}
-EXPORT_SYMBOL(blk_iopoll_init);
-
-static int blk_iopoll_cpu_notify(struct notifier_block *self,
-                                unsigned long action, void *hcpu)
-{
-       /*
-        * If a CPU goes away, splice its entries to the current CPU
-        * and trigger a run of the softirq
-        */
-       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-               int cpu = (unsigned long) hcpu;
-
-               local_irq_disable();
-               list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
-                                this_cpu_ptr(&blk_cpu_iopoll));
-               __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-               local_irq_enable();
-       }
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block blk_iopoll_cpu_notifier = {
-       .notifier_call  = blk_iopoll_cpu_notify,
-};
-
-static __init int blk_iopoll_setup(void)
-{
-       int i;
-
-       for_each_possible_cpu(i)
-               INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
-
-       open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
-       register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
-       return 0;
-}
-subsys_initcall(blk_iopoll_setup);
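
The machinery is not simply dropped: the same NAPI-style polling core lands in lib/irq_poll.c this cycle, so driver conversions are largely mechanical renames. A sketch under that assumption (iop, weight and my_poll_fn are placeholders):

	#include <linux/irq_poll.h>

	struct irq_poll iop;				/* was struct blk_iopoll */

	irq_poll_init(&iop, weight, my_poll_fn);	/* was blk_iopoll_init() */
	irq_poll_sched(&iop);				/* was blk_iopoll_sched() */
	irq_poll_complete(&iop);			/* was blk_iopoll_complete() */
	irq_poll_disable(&iop);				/* was blk_iopoll_disable() */
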
index 6889d7183a2ac1ec31d4132b03647191f8e4ed3a..4c0622fae41383d0f5577ea9b8b127d93df33bb6 100644 (file)
@@ -603,8 +603,6 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                        blk_mq_complete_request(rq, -EIO);
                return;
        }
-       if (rq->cmd_flags & REQ_NO_TIMEOUT)
-               return;
 
        if (time_after_eq(jiffies, rq->deadline)) {
                if (!blk_mark_rq_complete(rq))
@@ -615,15 +613,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
        }
 }
 
-static void blk_mq_rq_timer(unsigned long priv)
+static void blk_mq_timeout_work(struct work_struct *work)
 {
-       struct request_queue *q = (struct request_queue *)priv;
+       struct request_queue *q =
+               container_of(work, struct request_queue, timeout_work);
        struct blk_mq_timeout_data data = {
                .next           = 0,
                .next_set       = 0,
        };
        int i;
 
+       if (blk_queue_enter(q, true))
+               return;
+
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
        if (data.next_set) {
@@ -638,6 +640,7 @@ static void blk_mq_rq_timer(unsigned long priv)
                                blk_mq_tag_idle(hctx);
                }
        }
+       blk_queue_exit(q);
 }
 
 /*
@@ -2008,7 +2011,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                hctxs[i]->queue_num = i;
        }
 
-       setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+       INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
        q->nr_queues = nr_cpu_ids;
index 3610af5617488a9dbfec02511972c8f52d75f20f..a30441a200c0952eec31f6694dd05641a54227fa 100644 (file)
@@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
        }
 }
 
-void blk_rq_timed_out_timer(unsigned long data)
+void blk_timeout_work(struct work_struct *work)
 {
-       struct request_queue *q = (struct request_queue *) data;
+       struct request_queue *q =
+               container_of(work, struct request_queue, timeout_work);
        unsigned long flags, next = 0;
        struct request *rq, *tmp;
        int next_set = 0;
 
+       if (blk_queue_enter(q, true))
+               return;
        spin_lock_irqsave(q->queue_lock, flags);
 
        list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
@@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data)
                mod_timer(&q->timeout, round_jiffies_up(next));
 
        spin_unlock_irqrestore(q->queue_lock, flags);
+       blk_queue_exit(q);
 }
 
 /**
@@ -193,9 +197,6 @@ void blk_add_timer(struct request *req)
        struct request_queue *q = req->q;
        unsigned long expiry;
 
-       if (req->cmd_flags & REQ_NO_TIMEOUT)
-               return;
-
        /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
        if (!q->mq_ops && !q->rq_timed_out_fn)
                return;
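
Both timeout paths now bracket their scan with a non-blocking blk_queue_enter()/blk_queue_exit() pair, holding a q_usage_counter reference so a concurrent blk_cleanup_queue() cannot tear the queue down mid-scan, and bailing out immediately if the queue is already frozen or dying. The shared pattern, in sketch form:

	if (blk_queue_enter(q, true))	/* frozen or dying: skip this run */
		return;
	/* ... walk the timeout list / busy tags and expire requests ... */
	blk_queue_exit(q);		/* drop the q_usage_counter reference */
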
index c43926d3d74d0578216d1a699b1dfefff2a899df..70e4aee9cdcb6ead5974dce6a5abf454344b5b24 100644 (file)
@@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void)
 }
 #endif
 
-void blk_rq_timed_out_timer(unsigned long data);
+void blk_timeout_work(struct work_struct *work);
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
index 2c84683aada56e4b52c9080b88ce99eb1d7b68c8..77f5d17779d6875d0595f23410cbd61116491450 100644 (file)
@@ -455,12 +455,12 @@ static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
        if (arg && !blkdev_dax_capable(bdev))
                return -ENOTTY;
 
-       mutex_lock(&bdev->bd_inode->i_mutex);
+       inode_lock(bdev->bd_inode);
        if (bdev->bd_map_count == 0)
                inode_set_flags(bdev->bd_inode, arg, S_DAX);
        else
                rc = -EBUSY;
-       mutex_unlock(&bdev->bd_inode->i_mutex);
+       inode_unlock(bdev->bd_inode);
        return rc;
 }
 #else
index a8e7aa3e257bbc3c6254b82d2966b9fde3031184..f5e18c2a48527bb3f5bbdc5202b37577689710b3 100644 (file)
@@ -76,6 +76,8 @@ int af_alg_register_type(const struct af_alg_type *type)
                goto unlock;
 
        type->ops->owner = THIS_MODULE;
+       if (type->ops_nokey)
+               type->ops_nokey->owner = THIS_MODULE;
        node->type = type;
        list_add(&node->list, &alg_types);
        err = 0;
@@ -125,6 +127,26 @@ int af_alg_release(struct socket *sock)
 }
 EXPORT_SYMBOL_GPL(af_alg_release);
 
+void af_alg_release_parent(struct sock *sk)
+{
+       struct alg_sock *ask = alg_sk(sk);
+       unsigned int nokey = ask->nokey_refcnt;
+       bool last = nokey && !ask->refcnt;
+
+       sk = ask->parent;
+       ask = alg_sk(sk);
+
+       lock_sock(sk);
+       ask->nokey_refcnt -= nokey;
+       if (!last)
+               last = !--ask->refcnt;
+       release_sock(sk);
+
+       if (last)
+               sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(af_alg_release_parent);
+
 static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
        const u32 forbidden = CRYPTO_ALG_INTERNAL;
@@ -133,6 +155,7 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
        struct sockaddr_alg *sa = (void *)uaddr;
        const struct af_alg_type *type;
        void *private;
+       int err;
 
        if (sock->state == SS_CONNECTED)
                return -EINVAL;
@@ -160,16 +183,22 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
                return PTR_ERR(private);
        }
 
+       err = -EBUSY;
        lock_sock(sk);
+       if (ask->refcnt | ask->nokey_refcnt)
+               goto unlock;
 
        swap(ask->type, type);
        swap(ask->private, private);
 
+       err = 0;
+
+unlock:
        release_sock(sk);
 
        alg_do_release(type, private);
 
-       return 0;
+       return err;
 }
 
 static int alg_setkey(struct sock *sk, char __user *ukey,
@@ -202,11 +231,15 @@ static int alg_setsockopt(struct socket *sock, int level, int optname,
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        const struct af_alg_type *type;
-       int err = -ENOPROTOOPT;
+       int err = -EBUSY;
 
        lock_sock(sk);
+       if (ask->refcnt)
+               goto unlock;
+
        type = ask->type;
 
+       err = -ENOPROTOOPT;
        if (level != SOL_ALG || !type)
                goto unlock;
 
@@ -238,6 +271,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
        struct alg_sock *ask = alg_sk(sk);
        const struct af_alg_type *type;
        struct sock *sk2;
+       unsigned int nokey;
        int err;
 
        lock_sock(sk);
@@ -257,20 +291,29 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
        security_sk_clone(sk, sk2);
 
        err = type->accept(ask->private, sk2);
-       if (err) {
-               sk_free(sk2);
+
+       nokey = err == -ENOKEY;
+       if (nokey && type->accept_nokey)
+               err = type->accept_nokey(ask->private, sk2);
+
+       if (err)
                goto unlock;
-       }
 
        sk2->sk_family = PF_ALG;
 
-       sock_hold(sk);
+       if (nokey || !ask->refcnt++)
+               sock_hold(sk);
+       ask->nokey_refcnt += nokey;
        alg_sk(sk2)->parent = sk;
        alg_sk(sk2)->type = type;
+       alg_sk(sk2)->nokey_refcnt = nokey;
 
        newsock->ops = type->ops;
        newsock->state = SS_CONNECTED;
 
+       if (nokey)
+               newsock->ops = type->ops_nokey;
+
        err = 0;
 
 unlock:
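
The refcnt/nokey_refcnt pair ties the parent tfm socket's lifetime to its key state: bind() is refused with EBUSY once any child exists, ALG_SET_KEY is refused once a keyed child exists, and accept() on an unkeyed algorithm hands back a socket bound to the *_nokey ops whose data operations fail with ENOKEY until a key arrives. A hypothetical userspace program walking the protected sequence:

	#include <unistd.h>
	#include <sys/socket.h>
	#include <linux/if_alg.h>

	int main(void)
	{
		struct sockaddr_alg sa = {
			.salg_family = AF_ALG,
			.salg_type   = "hash",
			.salg_name   = "hmac(sha256)",
		};
		const char key[32] = { 0 };
		int tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);

		bind(tfm, (struct sockaddr *)&sa, sizeof(sa));
		/* Skipping this now makes I/O on 'op' fail with ENOKEY: */
		setsockopt(tfm, SOL_ALG, ALG_SET_KEY, key, sizeof(key));

		int op = accept(tfm, NULL, 0);
		write(op, "data", 4);		/* feed the hash */
		/* ... read the 32-byte digest back from 'op' ... */
		close(op);
		close(tfm);
		return 0;
	}
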
index 9c1dc8d6106a89a0f853271c1dfc49cd301ec983..d19b52324cf520ee777743ee895efb2f537f5e62 100644 (file)
@@ -451,6 +451,7 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm)
        struct ahash_alg *alg = crypto_ahash_alg(hash);
 
        hash->setkey = ahash_nosetkey;
+       hash->has_setkey = false;
        hash->export = ahash_no_export;
        hash->import = ahash_no_import;
 
@@ -463,8 +464,10 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm)
        hash->finup = alg->finup ?: ahash_def_finup;
        hash->digest = alg->digest;
 
-       if (alg->setkey)
+       if (alg->setkey) {
                hash->setkey = alg->setkey;
+               hash->has_setkey = true;
+       }
        if (alg->export)
                hash->export = alg->export;
        if (alg->import)
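
has_setkey records at tfm-init time whether the algorithm is keyed; algif_hash reads it back through crypto_ahash_has_setkey() (used below) to refuse a plain accept() on a keyed hash that was never given a key. The accessor is presumably just:

	static inline bool crypto_ahash_has_setkey(struct crypto_ahash *tfm)
	{
		return tfm->has_setkey;
	}
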
index b4c24fe3dcfb5ed575350406c9dd7ece3c006aef..608a7562839d2fe549fbebd8714733436ed2c481 100644 (file)
@@ -34,6 +34,11 @@ struct hash_ctx {
        struct ahash_request req;
 };
 
+struct algif_hash_tfm {
+       struct crypto_ahash *hash;
+       bool has_key;
+};
+
 static int hash_sendmsg(struct socket *sock, struct msghdr *msg,
                        size_t ignored)
 {
@@ -235,19 +240,151 @@ static struct proto_ops algif_hash_ops = {
        .accept         =       hash_accept,
 };
 
+static int hash_check_key(struct socket *sock)
+{
+       int err = 0;
+       struct sock *psk;
+       struct alg_sock *pask;
+       struct algif_hash_tfm *tfm;
+       struct sock *sk = sock->sk;
+       struct alg_sock *ask = alg_sk(sk);
+
+       lock_sock(sk);
+       if (ask->refcnt)
+               goto unlock_child;
+
+       psk = ask->parent;
+       pask = alg_sk(ask->parent);
+       tfm = pask->private;
+
+       err = -ENOKEY;
+       lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
+       if (!tfm->has_key)
+               goto unlock;
+
+       if (!pask->refcnt++)
+               sock_hold(psk);
+
+       ask->refcnt = 1;
+       sock_put(psk);
+
+       err = 0;
+
+unlock:
+       release_sock(psk);
+unlock_child:
+       release_sock(sk);
+
+       return err;
+}
+
+static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
+                             size_t size)
+{
+       int err;
+
+       err = hash_check_key(sock);
+       if (err)
+               return err;
+
+       return hash_sendmsg(sock, msg, size);
+}
+
+static ssize_t hash_sendpage_nokey(struct socket *sock, struct page *page,
+                                  int offset, size_t size, int flags)
+{
+       int err;
+
+       err = hash_check_key(sock);
+       if (err)
+               return err;
+
+       return hash_sendpage(sock, page, offset, size, flags);
+}
+
+static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
+                             size_t ignored, int flags)
+{
+       int err;
+
+       err = hash_check_key(sock);
+       if (err)
+               return err;
+
+       return hash_recvmsg(sock, msg, ignored, flags);
+}
+
+static int hash_accept_nokey(struct socket *sock, struct socket *newsock,
+                            int flags)
+{
+       int err;
+
+       err = hash_check_key(sock);
+       if (err)
+               return err;
+
+       return hash_accept(sock, newsock, flags);
+}
+
+static struct proto_ops algif_hash_ops_nokey = {
+       .family         =       PF_ALG,
+
+       .connect        =       sock_no_connect,
+       .socketpair     =       sock_no_socketpair,
+       .getname        =       sock_no_getname,
+       .ioctl          =       sock_no_ioctl,
+       .listen         =       sock_no_listen,
+       .shutdown       =       sock_no_shutdown,
+       .getsockopt     =       sock_no_getsockopt,
+       .mmap           =       sock_no_mmap,
+       .bind           =       sock_no_bind,
+       .setsockopt     =       sock_no_setsockopt,
+       .poll           =       sock_no_poll,
+
+       .release        =       af_alg_release,
+       .sendmsg        =       hash_sendmsg_nokey,
+       .sendpage       =       hash_sendpage_nokey,
+       .recvmsg        =       hash_recvmsg_nokey,
+       .accept         =       hash_accept_nokey,
+};
+
 static void *hash_bind(const char *name, u32 type, u32 mask)
 {
-       return crypto_alloc_ahash(name, type, mask);
+       struct algif_hash_tfm *tfm;
+       struct crypto_ahash *hash;
+
+       tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
+       if (!tfm)
+               return ERR_PTR(-ENOMEM);
+
+       hash = crypto_alloc_ahash(name, type, mask);
+       if (IS_ERR(hash)) {
+               kfree(tfm);
+               return ERR_CAST(hash);
+       }
+
+       tfm->hash = hash;
+
+       return tfm;
 }
 
 static void hash_release(void *private)
 {
-       crypto_free_ahash(private);
+       struct algif_hash_tfm *tfm = private;
+
+       crypto_free_ahash(tfm->hash);
+       kfree(tfm);
 }
 
 static int hash_setkey(void *private, const u8 *key, unsigned int keylen)
 {
-       return crypto_ahash_setkey(private, key, keylen);
+       struct algif_hash_tfm *tfm = private;
+       int err;
+
+       err = crypto_ahash_setkey(tfm->hash, key, keylen);
+       tfm->has_key = !err;
+
+       return err;
 }
 
 static void hash_sock_destruct(struct sock *sk)
@@ -261,12 +398,14 @@ static void hash_sock_destruct(struct sock *sk)
        af_alg_release_parent(sk);
 }
 
-static int hash_accept_parent(void *private, struct sock *sk)
+static int hash_accept_parent_nokey(void *private, struct sock *sk)
 {
        struct hash_ctx *ctx;
        struct alg_sock *ask = alg_sk(sk);
-       unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(private);
-       unsigned ds = crypto_ahash_digestsize(private);
+       struct algif_hash_tfm *tfm = private;
+       struct crypto_ahash *hash = tfm->hash;
+       unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(hash);
+       unsigned ds = crypto_ahash_digestsize(hash);
 
        ctx = sock_kmalloc(sk, len, GFP_KERNEL);
        if (!ctx)
@@ -286,7 +425,7 @@ static int hash_accept_parent(void *private, struct sock *sk)
 
        ask->private = ctx;
 
-       ahash_request_set_tfm(&ctx->req, private);
+       ahash_request_set_tfm(&ctx->req, hash);
        ahash_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                                   af_alg_complete, &ctx->completion);
 
@@ -295,12 +434,24 @@ static int hash_accept_parent(void *private, struct sock *sk)
        return 0;
 }
 
+static int hash_accept_parent(void *private, struct sock *sk)
+{
+       struct algif_hash_tfm *tfm = private;
+
+       if (!tfm->has_key && crypto_ahash_has_setkey(tfm->hash))
+               return -ENOKEY;
+
+       return hash_accept_parent_nokey(private, sk);
+}
+
 static const struct af_alg_type algif_type_hash = {
        .bind           =       hash_bind,
        .release        =       hash_release,
        .setkey         =       hash_setkey,
        .accept         =       hash_accept_parent,
+       .accept_nokey   =       hash_accept_parent_nokey,
        .ops            =       &algif_hash_ops,
+       .ops_nokey      =       &algif_hash_ops_nokey,
        .name           =       "hash",
        .owner          =       THIS_MODULE
 };
index eaa9f9be5b87824446c20f07d6bef14114e866b3..38c1aa89d3a0a7c4448bcd094ca1f268b9156a2d 100644 (file)
@@ -31,6 +31,11 @@ struct skcipher_sg_list {
        struct scatterlist sg[0];
 };
 
+struct skcipher_tfm {
+       struct crypto_skcipher *skcipher;
+       bool has_key;
+};
+
 struct skcipher_ctx {
        struct list_head tsgl;
        struct af_alg_sgl rsgl;
@@ -387,7 +392,8 @@ static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg,
 
                sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list);
                sg = sgl->sg;
-               sg_unmark_end(sg + sgl->cur);
+               if (sgl->cur)
+                       sg_unmark_end(sg + sgl->cur - 1);
                do {
                        i = sgl->cur;
                        plen = min_t(size_t, len, PAGE_SIZE);
@@ -642,13 +648,6 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg,
 
        lock_sock(sk);
        while (msg_data_left(msg)) {
-               sgl = list_first_entry(&ctx->tsgl,
-                                      struct skcipher_sg_list, list);
-               sg = sgl->sg;
-
-               while (!sg->length)
-                       sg++;
-
                if (!ctx->used) {
                        err = skcipher_wait_for_data(sk, flags);
                        if (err)
@@ -669,6 +668,13 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg,
                if (!used)
                        goto free;
 
+               sgl = list_first_entry(&ctx->tsgl,
+                                      struct skcipher_sg_list, list);
+               sg = sgl->sg;
+
+               while (!sg->length)
+                       sg++;
+
                skcipher_request_set_crypt(&ctx->req, sg, ctx->rsgl.sg, used,
                                           ctx->iv);
 
@@ -748,19 +754,139 @@ static struct proto_ops algif_skcipher_ops = {
        .poll           =       skcipher_poll,
 };
 
+static int skcipher_check_key(struct socket *sock)
+{
+       int err = 0;
+       struct sock *psk;
+       struct alg_sock *pask;
+       struct skcipher_tfm *tfm;
+       struct sock *sk = sock->sk;
+       struct alg_sock *ask = alg_sk(sk);
+
+       lock_sock(sk);
+       if (ask->refcnt)
+               goto unlock_child;
+
+       psk = ask->parent;
+       pask = alg_sk(ask->parent);
+       tfm = pask->private;
+
+       err = -ENOKEY;
+       lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
+       if (!tfm->has_key)
+               goto unlock;
+
+       if (!pask->refcnt++)
+               sock_hold(psk);
+
+       ask->refcnt = 1;
+       sock_put(psk);
+
+       err = 0;
+
+unlock:
+       release_sock(psk);
+unlock_child:
+       release_sock(sk);
+
+       return err;
+}
+
+static int skcipher_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
+                                 size_t size)
+{
+       int err;
+
+       err = skcipher_check_key(sock);
+       if (err)
+               return err;
+
+       return skcipher_sendmsg(sock, msg, size);
+}
+
+static ssize_t skcipher_sendpage_nokey(struct socket *sock, struct page *page,
+                                      int offset, size_t size, int flags)
+{
+       int err;
+
+       err = skcipher_check_key(sock);
+       if (err)
+               return err;
+
+       return skcipher_sendpage(sock, page, offset, size, flags);
+}
+
+static int skcipher_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
+                                 size_t ignored, int flags)
+{
+       int err;
+
+       err = skcipher_check_key(sock);
+       if (err)
+               return err;
+
+       return skcipher_recvmsg(sock, msg, ignored, flags);
+}
+
+static struct proto_ops algif_skcipher_ops_nokey = {
+       .family         =       PF_ALG,
+
+       .connect        =       sock_no_connect,
+       .socketpair     =       sock_no_socketpair,
+       .getname        =       sock_no_getname,
+       .ioctl          =       sock_no_ioctl,
+       .listen         =       sock_no_listen,
+       .shutdown       =       sock_no_shutdown,
+       .getsockopt     =       sock_no_getsockopt,
+       .mmap           =       sock_no_mmap,
+       .bind           =       sock_no_bind,
+       .accept         =       sock_no_accept,
+       .setsockopt     =       sock_no_setsockopt,
+
+       .release        =       af_alg_release,
+       .sendmsg        =       skcipher_sendmsg_nokey,
+       .sendpage       =       skcipher_sendpage_nokey,
+       .recvmsg        =       skcipher_recvmsg_nokey,
+       .poll           =       skcipher_poll,
+};
+
 static void *skcipher_bind(const char *name, u32 type, u32 mask)
 {
-       return crypto_alloc_skcipher(name, type, mask);
+       struct skcipher_tfm *tfm;
+       struct crypto_skcipher *skcipher;
+
+       tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
+       if (!tfm)
+               return ERR_PTR(-ENOMEM);
+
+       skcipher = crypto_alloc_skcipher(name, type, mask);
+       if (IS_ERR(skcipher)) {
+               kfree(tfm);
+               return ERR_CAST(skcipher);
+       }
+
+       tfm->skcipher = skcipher;
+
+       return tfm;
 }
 
 static void skcipher_release(void *private)
 {
-       crypto_free_skcipher(private);
+       struct skcipher_tfm *tfm = private;
+
+       crypto_free_skcipher(tfm->skcipher);
+       kfree(tfm);
 }
 
 static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen)
 {
-       return crypto_skcipher_setkey(private, key, keylen);
+       struct skcipher_tfm *tfm = private;
+       int err;
+
+       err = crypto_skcipher_setkey(tfm->skcipher, key, keylen);
+       tfm->has_key = !err;
+
+       return err;
 }
 
 static void skcipher_wait(struct sock *sk)
@@ -788,24 +914,26 @@ static void skcipher_sock_destruct(struct sock *sk)
        af_alg_release_parent(sk);
 }
 
-static int skcipher_accept_parent(void *private, struct sock *sk)
+static int skcipher_accept_parent_nokey(void *private, struct sock *sk)
 {
        struct skcipher_ctx *ctx;
        struct alg_sock *ask = alg_sk(sk);
-       unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(private);
+       struct skcipher_tfm *tfm = private;
+       struct crypto_skcipher *skcipher = tfm->skcipher;
+       unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(skcipher);
 
        ctx = sock_kmalloc(sk, len, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;
 
-       ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(private),
+       ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(skcipher),
                               GFP_KERNEL);
        if (!ctx->iv) {
                sock_kfree_s(sk, ctx, len);
                return -ENOMEM;
        }
 
-       memset(ctx->iv, 0, crypto_skcipher_ivsize(private));
+       memset(ctx->iv, 0, crypto_skcipher_ivsize(skcipher));
 
        INIT_LIST_HEAD(&ctx->tsgl);
        ctx->len = len;
@@ -818,7 +946,7 @@ static int skcipher_accept_parent(void *private, struct sock *sk)
 
        ask->private = ctx;
 
-       skcipher_request_set_tfm(&ctx->req, private);
+       skcipher_request_set_tfm(&ctx->req, skcipher);
        skcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                                      af_alg_complete, &ctx->completion);
 
@@ -827,12 +955,24 @@ static int skcipher_accept_parent(void *private, struct sock *sk)
        return 0;
 }
 
+static int skcipher_accept_parent(void *private, struct sock *sk)
+{
+       struct skcipher_tfm *tfm = private;
+
+       if (!tfm->has_key && crypto_skcipher_has_setkey(tfm->skcipher))
+               return -ENOKEY;
+
+       return skcipher_accept_parent_nokey(private, sk);
+}
+
 static const struct af_alg_type algif_type_skcipher = {
        .bind           =       skcipher_bind,
        .release        =       skcipher_release,
        .setkey         =       skcipher_setkey,
        .accept         =       skcipher_accept_parent,
+       .accept_nokey   =       skcipher_accept_parent_nokey,
        .ops            =       &algif_skcipher_ops,
+       .ops_nokey      =       &algif_skcipher_ops_nokey,
        .name           =       "skcipher",
        .owner          =       THIS_MODULE
 };
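
Two fixes ride along with the nokey plumbing above. In sendmsg, sg_unmark_end() was applied one slot past the end mark: sgl->cur counts filled entries, so the mark set by sg_mark_end() sits on entry cur - 1 and exists only when cur is nonzero. And recvmsg_sync now fetches the head of ctx->tsgl only after the ctx->used check and skcipher_wait_for_data() have guaranteed there is data, instead of dereferencing the first entry of a possibly empty list.
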
index 06f1b60f02b223eeea70c000ec4055c470d4eeae..4c0a0e27187694c6d0d7281c1d61b2f25557f269 100644 (file)
@@ -172,4 +172,3 @@ MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations wrapper for lib/crc32c");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_CRYPTO("crc32c");
 MODULE_ALIAS_CRYPTO("crc32c-generic");
-MODULE_SOFTDEP("pre: crc32c");
index ecb1e3d39bf0776a0d8805a9f8fadd3ec288bcb0..88a27de798480cc01eb0220cdf1f0002ddd897e4 100644 (file)
@@ -355,8 +355,10 @@ int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
        crt->finup = shash_async_finup;
        crt->digest = shash_async_digest;
 
-       if (alg->setkey)
+       if (alg->setkey) {
                crt->setkey = shash_async_setkey;
+               crt->has_setkey = true;
+       }
        if (alg->export)
                crt->export = shash_async_export;
        if (alg->import)
index 7591928be7ca789cfc909b3ea300fb6f2fd2cb6f..d199c0b1751c91cbcc5a978aadb97cdf609e33ab 100644 (file)
@@ -118,6 +118,7 @@ static int crypto_init_skcipher_ops_blkcipher(struct crypto_tfm *tfm)
        skcipher->decrypt = skcipher_decrypt_blkcipher;
 
        skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher);
+       skcipher->has_setkey = calg->cra_blkcipher.max_keysize;
 
        return 0;
 }
@@ -210,6 +211,7 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm)
        skcipher->ivsize = crypto_ablkcipher_ivsize(ablkcipher);
        skcipher->reqsize = crypto_ablkcipher_reqsize(ablkcipher) +
                            sizeof(struct ablkcipher_request);
+       skcipher->has_setkey = calg->cra_ablkcipher.max_keysize;
 
        return 0;
 }
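
The has_setkey assignments above lean on implicit conversion: cra_blkcipher.max_keysize (and the ablkcipher equivalent) is nonzero exactly for algorithms that accept a key, so storing it in a bool yields true for keyed ciphers and false otherwise.
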
index 6682c5daf742637024c1f72a19f7de750cb1bbe1..6e6bc1059301745fc1d5fd6ebf943d048082e821 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/hardirq.h>
 #include <linux/pstore.h>
 #include <linux/vmalloc.h>
+#include <linux/mm.h> /* kvfree() */
 #include <acpi/apei.h>
 
 #include "apei-internal.h"
@@ -532,10 +533,7 @@ retry:
                        return -ENOMEM;
                memcpy(new_entries, entries,
                       erst_record_id_cache.len * sizeof(entries[0]));
-               if (erst_record_id_cache.size < PAGE_SIZE)
-                       kfree(entries);
-               else
-                       vfree(entries);
+               kvfree(entries);
                erst_record_id_cache.entries = entries = new_entries;
                erst_record_id_cache.size = new_size;
        }
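
kvfree() releases a buffer regardless of whether it came from kmalloc() or vmalloc(), which is precisely the situation here as the record-id cache graduates from a slab allocation to vmalloc while it grows; hence the new <linux/mm.h> include above. Its mm/util.c implementation is essentially:

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
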
index 4a5c9d27905900f7bdc1d978f3ef89116e1d83c9..294ba6f363960617c3d1ac2e6191c81e4b384372 100644 (file)
@@ -4,7 +4,7 @@ config ARM_AMBA
 if ARM_AMBA
 
 config TEGRA_AHB
-       bool "Enable AHB driver for NVIDIA Tegra SoCs"
+       bool
        default y if ARCH_TEGRA
        help
          Adds AHB configuration functionality for NVIDIA Tegra SoCs,
index 68f03141e432a2c5a03358d59ed541ac9ae79529..44a74cf1372c6e710d2b102ac51dd34ee44ffd0e 100644 (file)
@@ -215,9 +215,9 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
                newattrs.ia_uid = uid;
                newattrs.ia_gid = gid;
                newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID;
-               mutex_lock(&d_inode(dentry)->i_mutex);
+               inode_lock(d_inode(dentry));
                notify_change(dentry, &newattrs, NULL);
-               mutex_unlock(&d_inode(dentry)->i_mutex);
+               inode_unlock(d_inode(dentry));
 
                /* mark as kernel-created inode */
                d_inode(dentry)->i_private = &thread;
@@ -244,7 +244,7 @@ static int dev_rmdir(const char *name)
                err = -ENOENT;
        }
        dput(dentry);
-       mutex_unlock(&d_inode(parent.dentry)->i_mutex);
+       inode_unlock(d_inode(parent.dentry));
        path_put(&parent);
        return err;
 }
@@ -321,9 +321,9 @@ static int handle_remove(const char *nodename, struct device *dev)
                        newattrs.ia_mode = stat.mode & ~0777;
                        newattrs.ia_valid =
                                ATTR_UID|ATTR_GID|ATTR_MODE;
-                       mutex_lock(&d_inode(dentry)->i_mutex);
+                       inode_lock(d_inode(dentry));
                        notify_change(dentry, &newattrs, NULL);
-                       mutex_unlock(&d_inode(dentry)->i_mutex);
+                       inode_unlock(d_inode(dentry));
                        err = vfs_unlink(d_inode(parent.dentry), dentry, NULL);
                        if (!err || err == -ENOENT)
                                deleted = 1;
@@ -332,7 +332,7 @@ static int handle_remove(const char *nodename, struct device *dev)
                err = -ENOENT;
        }
        dput(dentry);
-       mutex_unlock(&d_inode(parent.dentry)->i_mutex);
+       inode_unlock(d_inode(parent.dentry));
 
        path_put(&parent);
        if (deleted && strchr(nodename, '/'))
index ad80c85e0857064c1ed3cd6420a1ef582ad76c4a..d048d2009e897a7e76d523ec704b1fd1f77dd76b 100644 (file)
@@ -964,9 +964,9 @@ aoecmd_sleepwork(struct work_struct *work)
                ssize = get_capacity(d->gd);
                bd = bdget_disk(d->gd, 0);
                if (bd) {
-                       mutex_lock(&bd->bd_inode->i_mutex);
+                       inode_lock(bd->bd_inode);
                        i_size_write(bd->bd_inode, (loff_t)ssize<<9);
-                       mutex_unlock(&bd->bd_inode->i_mutex);
+                       inode_unlock(bd->bd_inode);
                        bdput(bd);
                }
                spin_lock_irq(&d->lock);
index b3868e7a1ffddbb57e7165a65b6102d872ca4983..10459a14506224ede01f9bbca97be127397142cd 100644 (file)
@@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *
        return need_transaction;
 }
 
-static int al_write_transaction(struct drbd_device *device);
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+       return al_enr >>
+               /* bit to page */
+               ((PAGE_SHIFT + 3) -
+               /* al extent number to bit */
+                (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
+{
+       const unsigned int stripes = device->ldev->md.al_stripes;
+       const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
+
+       /* transaction number, modulo on-disk ring buffer wrap around */
+       unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
+
+       /* ... to aligned 4k on disk block */
+       t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+       /* ... to 512 byte sector in activity log */
+       t *= 8;
+
+       /* ... plus offset to the on disk position */
+       return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
+}
+
+static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer)
+{
+       struct lc_element *e;
+       sector_t sector;
+       int i, mx;
+       unsigned extent_nr;
+       unsigned crc = 0;
+       int err = 0;
+
+       memset(buffer, 0, sizeof(*buffer));
+       buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
+       buffer->tr_number = cpu_to_be32(device->al_tr_number);
+
+       i = 0;
+
+       /* Even though no one can start to change this list
+        * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+        * lc_try_lock_for_transaction() --, someone may still
+        * be in the process of changing it. */
+       spin_lock_irq(&device->al_lock);
+       list_for_each_entry(e, &device->act_log->to_be_changed, list) {
+               if (i == AL_UPDATES_PER_TRANSACTION) {
+                       i++;
+                       break;
+               }
+               buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+               buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+               if (e->lc_number != LC_FREE)
+                       drbd_bm_mark_for_writeout(device,
+                                       al_extent_to_bm_page(e->lc_number));
+               i++;
+       }
+       spin_unlock_irq(&device->al_lock);
+       BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
+
+       buffer->n_updates = cpu_to_be16(i);
+       for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+               buffer->update_slot_nr[i] = cpu_to_be16(-1);
+               buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+       }
+
+       buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
+       buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
+
+       mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
+                  device->act_log->nr_elements - device->al_tr_cycle);
+       for (i = 0; i < mx; i++) {
+               unsigned idx = device->al_tr_cycle + i;
+               extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
+               buffer->context[i] = cpu_to_be32(extent_nr);
+       }
+       for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+               buffer->context[i] = cpu_to_be32(LC_FREE);
+
+       device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
+       if (device->al_tr_cycle >= device->act_log->nr_elements)
+               device->al_tr_cycle = 0;
+
+       sector = al_tr_number_to_on_disk_sector(device);
+
+       crc = crc32c(0, buffer, 4096);
+       buffer->crc32c = cpu_to_be32(crc);
+
+       if (drbd_bm_write_hinted(device))
+               err = -EIO;
+       else {
+               bool write_al_updates;
+               rcu_read_lock();
+               write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+               rcu_read_unlock();
+               if (write_al_updates) {
+                       if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+                               err = -EIO;
+                               drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+                       } else {
+                               device->al_tr_number++;
+                               device->al_writ_cnt++;
+                       }
+               }
+       }
+
+       return err;
+}
+
+static int al_write_transaction(struct drbd_device *device)
+{
+       struct al_transaction_on_disk *buffer;
+       int err;
+
+       if (!get_ldev(device)) {
+               drbd_err(device, "disk is %s, cannot start al transaction\n",
+                       drbd_disk_str(device->state.disk));
+               return -EIO;
+       }
+
+       /* The bitmap write may have failed, causing a state change. */
+       if (device->state.disk < D_INCONSISTENT) {
+               drbd_err(device,
+                       "disk is %s, cannot write al transaction\n",
+                       drbd_disk_str(device->state.disk));
+               put_ldev(device);
+               return -EIO;
+       }
+
+       /* protects md_io_buffer, al_tr_cycle, ... */
+       buffer = drbd_md_get_buffer(device, __func__);
+       if (!buffer) {
+               drbd_err(device, "disk failed while waiting for md_io buffer\n");
+               put_ldev(device);
+               return -ENODEV;
+       }
+
+       err = __al_write_transaction(device, buffer);
+
+       drbd_md_put_buffer(device);
+       put_ldev(device);
+
+       return err;
+}
+
 
 void drbd_al_begin_io_commit(struct drbd_device *device)
 {
@@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
        wake_up(&device->al_wait);
 }
 
-#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
-/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
- * are still coupled, or assume too much about their relation.
- * Code below will not work if this is violated.
- * Will be cleaned up with some followup patch.
- */
-# error FIXME
-#endif
-
-static unsigned int al_extent_to_bm_page(unsigned int al_enr)
-{
-       return al_enr >>
-               /* bit to page */
-               ((PAGE_SHIFT + 3) -
-               /* al extent number to bit */
-                (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
-}
-
-static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
-{
-       const unsigned int stripes = device->ldev->md.al_stripes;
-       const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
-
-       /* transaction number, modulo on-disk ring buffer wrap around */
-       unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
-
-       /* ... to aligned 4k on disk block */
-       t = ((t % stripes) * stripe_size_4kB) + t/stripes;
-
-       /* ... to 512 byte sector in activity log */
-       t *= 8;
-
-       /* ... plus offset to the on disk position */
-       return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
-}
-
-int al_write_transaction(struct drbd_device *device)
-{
-       struct al_transaction_on_disk *buffer;
-       struct lc_element *e;
-       sector_t sector;
-       int i, mx;
-       unsigned extent_nr;
-       unsigned crc = 0;
-       int err = 0;
-
-       if (!get_ldev(device)) {
-               drbd_err(device, "disk is %s, cannot start al transaction\n",
-                       drbd_disk_str(device->state.disk));
-               return -EIO;
-       }
-
-       /* The bitmap write may have failed, causing a state change. */
-       if (device->state.disk < D_INCONSISTENT) {
-               drbd_err(device,
-                       "disk is %s, cannot write al transaction\n",
-                       drbd_disk_str(device->state.disk));
-               put_ldev(device);
-               return -EIO;
-       }
-
-       /* protects md_io_buffer, al_tr_cycle, ... */
-       buffer = drbd_md_get_buffer(device, __func__);
-       if (!buffer) {
-               drbd_err(device, "disk failed while waiting for md_io buffer\n");
-               put_ldev(device);
-               return -ENODEV;
-       }
-
-       memset(buffer, 0, sizeof(*buffer));
-       buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
-       buffer->tr_number = cpu_to_be32(device->al_tr_number);
-
-       i = 0;
-
-       /* Even though no one can start to change this list
-        * once we set the LC_LOCKED -- from drbd_al_begin_io(),
-        * lc_try_lock_for_transaction() --, someone may still
-        * be in the process of changing it. */
-       spin_lock_irq(&device->al_lock);
-       list_for_each_entry(e, &device->act_log->to_be_changed, list) {
-               if (i == AL_UPDATES_PER_TRANSACTION) {
-                       i++;
-                       break;
-               }
-               buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
-               buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
-               if (e->lc_number != LC_FREE)
-                       drbd_bm_mark_for_writeout(device,
-                                       al_extent_to_bm_page(e->lc_number));
-               i++;
-       }
-       spin_unlock_irq(&device->al_lock);
-       BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
-
-       buffer->n_updates = cpu_to_be16(i);
-       for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
-               buffer->update_slot_nr[i] = cpu_to_be16(-1);
-               buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
-       }
-
-       buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
-       buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
-
-       mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
-                  device->act_log->nr_elements - device->al_tr_cycle);
-       for (i = 0; i < mx; i++) {
-               unsigned idx = device->al_tr_cycle + i;
-               extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
-               buffer->context[i] = cpu_to_be32(extent_nr);
-       }
-       for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
-               buffer->context[i] = cpu_to_be32(LC_FREE);
-
-       device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
-       if (device->al_tr_cycle >= device->act_log->nr_elements)
-               device->al_tr_cycle = 0;
-
-       sector = al_tr_number_to_on_disk_sector(device);
-
-       crc = crc32c(0, buffer, 4096);
-       buffer->crc32c = cpu_to_be32(crc);
-
-       if (drbd_bm_write_hinted(device))
-               err = -EIO;
-       else {
-               bool write_al_updates;
-               rcu_read_lock();
-               write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
-               rcu_read_unlock();
-               if (write_al_updates) {
-                       if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
-                               err = -EIO;
-                               drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
-                       } else {
-                               device->al_tr_number++;
-                               device->al_writ_cnt++;
-                       }
-               }
-       }
-
-       drbd_md_put_buffer(device);
-       put_ldev(device);
-
-       return err;
-}
-
 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
 {
        int rv;
@@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device)
        wake_up(&device->al_wait);
 }
 
-int drbd_initialize_al(struct drbd_device *device, void *buffer)
+int drbd_al_initialize(struct drbd_device *device, void *buffer)
 {
        struct al_transaction_on_disk *al = buffer;
        struct drbd_md *md = &device->ldev->md;
-       sector_t al_base = md->md_offset + md->al_offset;
        int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
        int i;
 
-       memset(al, 0, 4096);
-       al->magic = cpu_to_be32(DRBD_AL_MAGIC);
-       al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
-       al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
+       __al_write_transaction(device, al);
+       /* There may or may not have been a pending transaction. */
+       spin_lock_irq(&device->al_lock);
+       lc_committed(device->act_log);
+       spin_unlock_irq(&device->al_lock);
 
-       for (i = 0; i < al_size_4k; i++) {
-               int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
+       /* The rest of the transactions will have an empty "updates" list, and
+        * are written out only to provide the context, and to initialize the
+        * on-disk ring buffer. */
+       for (i = 1; i < al_size_4k; i++) {
+               int err = __al_write_transaction(device, al);
                if (err)
                        return err;
        }
index 9462d27528507d693d8e4efe0e6464597ab1768b..92d6fc020a657c0694cc02c99d134fdf47c85c12 100644 (file)
@@ -24,7 +24,7 @@
 
 #define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
 
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
 #include <linux/vmalloc.h>
 #include <linux/string.h>
 #include <linux/drbd.h>
@@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number)
        }
 }
 
-static void bm_vk_free(void *ptr, int v)
+static inline void bm_vk_free(void *ptr)
 {
-       if (v)
-               vfree(ptr);
-       else
-               kfree(ptr);
+       kvfree(ptr);
 }
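kvfree() frees memory that came from either kmalloc() or vmalloc(), which is what lets the BM_P_VMALLOCED bookkeeping disappear throughout this file. A hedged sketch of the allocation side such a helper pairs with; the fallback pattern and the alloc_table() name are illustrative, not code from drbd:

    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    /* Try the cheap physically contiguous allocation first; fall back to
     * vmalloc for large or fragmented cases.  Either way the caller can
     * release the memory with a single kvfree(). */
    static void *alloc_table(size_t bytes)
    {
            void *p = kmalloc(bytes, GFP_KERNEL | __GFP_NOWARN);

            if (!p)
                    p = vmalloc(bytes);
            return p; /* release with kvfree(p) */
    }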
 
 /*
@@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 {
        struct page **old_pages = b->bm_pages;
        struct page **new_pages, *page;
-       unsigned int i, bytes, vmalloced = 0;
+       unsigned int i, bytes;
        unsigned long have = b->bm_number_of_pages;
 
        BUG_ON(have == 0 && old_pages != NULL);
@@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
                                PAGE_KERNEL);
                if (!new_pages)
                        return NULL;
-               vmalloced = 1;
        }
 
        if (want >= have) {
@@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
                        page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
                        if (!page) {
                                bm_free_pages(new_pages + have, i - have);
-                               bm_vk_free(new_pages, vmalloced);
+                               bm_vk_free(new_pages);
                                return NULL;
                        }
                        /* we want to know which page it is
@@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
                */
        }
 
-       if (vmalloced)
-               b->bm_flags |= BM_P_VMALLOCED;
-       else
-               b->bm_flags &= ~BM_P_VMALLOCED;
-
        return new_pages;
 }
 
@@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device)
        if (!expect(device->bitmap))
                return;
        bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
-       bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
+       bm_vk_free(device->bitmap->bm_pages);
        kfree(device->bitmap);
        device->bitmap = NULL;
 }
@@ -479,8 +470,14 @@ void drbd_bm_cleanup(struct drbd_device *device)
  * this masks out the remaining bits.
  * Returns the number of bits cleared.
  */
+#ifndef BITS_PER_PAGE
 #define BITS_PER_PAGE          (1UL << (PAGE_SHIFT + 3))
 #define BITS_PER_PAGE_MASK     (BITS_PER_PAGE - 1)
+#else
+# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
+#  error "ambiguous BITS_PER_PAGE"
+# endif
+#endif
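The guard added here follows a common preprocessor idiom: define the macro only if nobody defined it yet, and turn a conflicting earlier definition into a hard build error instead of a silent mismatch. The same shape in isolation, with a made-up macro name:

    /* Accept an identical prior definition, reject a conflicting one. */
    #ifndef WIDGET_BITS
    #define WIDGET_BITS 16
    #else
    # if WIDGET_BITS != 16
    #  error "ambiguous WIDGET_BITS"
    # endif
    #endif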
 #define BITS_PER_LONG_MASK     (BITS_PER_LONG - 1)
 static int bm_clear_surplus(struct drbd_bitmap *b)
 {
@@ -559,21 +556,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
        unsigned long *p_addr;
        unsigned long bits = 0;
        unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
-       int idx, i, last_word;
+       int idx, last_word;
 
        /* all but last page */
        for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
                p_addr = __bm_map_pidx(b, idx);
-               for (i = 0; i < LWPP; i++)
-                       bits += hweight_long(p_addr[i]);
+               bits += bitmap_weight(p_addr, BITS_PER_PAGE);
                __bm_unmap(p_addr);
                cond_resched();
        }
        /* last (or only) page */
        last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
        p_addr = __bm_map_pidx(b, idx);
-       for (i = 0; i < last_word; i++)
-               bits += hweight_long(p_addr[i]);
+       bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
        p_addr[last_word] &= cpu_to_lel(mask);
        bits += hweight_long(p_addr[last_word]);
        /* 32bit arch, may have an unused padding long */
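bitmap_weight(p, nbits) counts the set bits in the first nbits of a bitmap, so a single call replaces the per-word hweight_long() loops removed above. A small userspace check of that equivalence, with __builtin_popcountl() standing in for the kernel's hweight_long():

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    #define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

    /* Reference bitmap_weight(): whole words, then a masked partial word. */
    static unsigned long bitmap_weight_ref(const unsigned long *p, unsigned int nbits)
    {
            unsigned int i, full = nbits / BITS_PER_LONG, rem = nbits % BITS_PER_LONG;
            unsigned long w = 0;

            for (i = 0; i < full; i++)
                    w += __builtin_popcountl(p[i]);
            if (rem)
                    w += __builtin_popcountl(p[full] & ((1UL << rem) - 1));
            return w;
    }

    int main(void)
    {
            unsigned long words[4] = { ~0UL, 0x5UL, 0UL, 1UL << 7 };
            unsigned long loop = 0;
            int i;

            for (i = 0; i < 4; i++) /* the old per-word loop */
                    loop += __builtin_popcountl(words[i]);

            assert(loop == bitmap_weight_ref(words, 4 * BITS_PER_LONG));
            printf("weight = %lu\n", loop);
            return 0;
    }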
@@ -639,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
        unsigned long want, have, onpages; /* number of pages */
        struct page **npages, **opages = NULL;
        int err = 0, growing;
-       int opages_vmalloced;
 
        if (!expect(b))
                return -ENOMEM;
@@ -652,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
        if (capacity == b->bm_dev_capacity)
                goto out;
 
-       opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
-
        if (capacity == 0) {
                spin_lock_irq(&b->bm_lock);
                opages = b->bm_pages;
@@ -667,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
                b->bm_dev_capacity = 0;
                spin_unlock_irq(&b->bm_lock);
                bm_free_pages(opages, onpages);
-               bm_vk_free(opages, opages_vmalloced);
+               bm_vk_free(opages);
                goto out;
        }
        bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
@@ -740,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 
        spin_unlock_irq(&b->bm_lock);
        if (opages != npages)
-               bm_vk_free(opages, opages_vmalloced);
+               bm_vk_free(opages);
        if (!growing)
                b->bm_set = bm_count_bits(b);
        drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
@@ -1419,6 +1411,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
        int bits;
        int changed = 0;
        unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
+
+       /* I think it is more cache-line friendly to hweight_long() each word,
+        * then set it to ~0UL, than to first bitmap_weight() all words, then
+        * bitmap_fill() all words. */
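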
        for (i = first_word; i < last_word; i++) {
                bits = hweight_long(paddr[i]);
                paddr[i] = ~0UL;
@@ -1628,8 +1623,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
                int n = e-s;
                p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
                bm = p_addr + MLPP(s);
-               while (n--)
-                       count += hweight_long(*bm++);
+               count += bitmap_weight(bm, n * BITS_PER_LONG);
                bm_unmap(p_addr);
        } else {
                drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
index 6b88a35fb048ed8f302f6025b743d30d7646ec41..4de95bbff4860b2d39ad6fc40cbf95ab849929b1 100644 (file)
@@ -434,12 +434,12 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo
        if (!parent || d_really_is_negative(parent))
                goto out;
        /* serialize with d_delete() */
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        /* Make sure the object is still alive */
        if (simple_positive(file->f_path.dentry)
        && kref_get_unless_zero(kref))
                ret = 0;
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        if (!ret) {
                ret = single_open(file, show, data);
                if (ret)
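inode_lock() and inode_unlock() are the then-new fs.h accessors that stop callers from reaching into inode internals directly. At the time of this change they were thin wrappers, roughly as below; this is a paraphrase of the upstream helpers, not drbd code:

    static inline void inode_lock(struct inode *inode)
    {
            mutex_lock(&inode->i_mutex);
    }

    static inline void inode_unlock(struct inode *inode)
    {
            mutex_unlock(&inode->i_mutex);
    }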
@@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored)
        return 0;
 }
 
+static int device_ed_gen_id_show(struct seq_file *m, void *ignored)
+{
+       struct drbd_device *device = m->private;
+       seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid);
+       return 0;
+}
+
 #define drbd_debugfs_device_attr(name)                                         \
 static int device_ ## name ## _open(struct inode *inode, struct file *file)    \
 {                                                                              \
@@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests)
 drbd_debugfs_device_attr(act_log_extents)
 drbd_debugfs_device_attr(resync_extents)
 drbd_debugfs_device_attr(data_gen_id)
+drbd_debugfs_device_attr(ed_gen_id)
 
 void drbd_debugfs_device_add(struct drbd_device *device)
 {
@@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
        DCF(act_log_extents);
        DCF(resync_extents);
        DCF(data_gen_id);
+       DCF(ed_gen_id);
 #undef DCF
        return;
 
@@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device)
        drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
        drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
        drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+       drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id);
        drbd_debugfs_remove(&device->debugfs_vol);
 }
 
index e66d453a5f2b1bcdf10ffa367917205081bd7587..34bc84efc29e99085d9ccdeb6a4fa49f5b079446 100644 (file)
@@ -77,13 +77,6 @@ extern int fault_devs;
 extern char usermode_helper[];
 
 
-/* I don't remember why XCPU ...
- * This is used to wake the asender,
- * and to interrupt sending the sending task
- * on disconnect.
- */
-#define DRBD_SIG SIGXCPU
-
 /* This is used to stop/restart our threads.
  * Cannot use SIGTERM nor SIGKILL, since these
  * are sent out by init on runlevel changes
@@ -292,6 +285,9 @@ struct drbd_device_work {
 
 extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
 
+extern void lock_all_resources(void);
+extern void unlock_all_resources(void);
+
 struct drbd_request {
        struct drbd_work w;
        struct drbd_device *device;
@@ -504,7 +500,6 @@ enum {
 
        MD_NO_FUA,              /* User wants us not to use FUA/FLUSH on meta data dev */
 
-       SUSPEND_IO,             /* suspend application io */
        BITMAP_IO,              /* suspend application io;
                                   once no more io in flight, start bitmap io */
        BITMAP_IO_QUEUED,       /* Started bitmap IO */
@@ -541,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */
 /* definition of bits in bm_flags to be used in drbd_bm_lock
  * and drbd_bitmap_io and friends. */
 enum bm_flag {
-       /* do we need to kfree, or vfree bm_pages? */
-       BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
-
        /* currently locked for bulk operation */
        BM_LOCKED_MASK = 0xf,
 
@@ -632,12 +624,6 @@ struct bm_io_work {
        void (*done)(struct drbd_device *device, int rv);
 };
 
-enum write_ordering_e {
-       WO_none,
-       WO_drain_io,
-       WO_bdev_flush,
-};
-
 struct fifo_buffer {
        unsigned int head_index;
        unsigned int size;
@@ -650,8 +636,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size);
 enum {
        NET_CONGESTED,          /* The data socket is congested */
        RESOLVE_CONFLICTS,      /* Set on one node, cleared on the peer! */
-       SEND_PING,              /* whether asender should send a ping asap */
-       SIGNAL_ASENDER,         /* whether asender wants to be interrupted */
+       SEND_PING,
        GOT_PING_ACK,           /* set when we receive a ping_ack packet, ping_wait gets woken */
        CONN_WD_ST_CHG_REQ,     /* A cluster wide state change on the connection is active */
        CONN_WD_ST_CHG_OKAY,
@@ -670,6 +655,8 @@ enum {
        DEVICE_WORK_PENDING,    /* tell worker that some device has pending work */
 };
 
+enum which_state { NOW, OLD = NOW, NEW };
+
 struct drbd_resource {
        char *name;
 #ifdef CONFIG_DEBUG_FS
@@ -755,7 +742,8 @@ struct drbd_connection {
        unsigned long last_reconnect_jif;
        struct drbd_thread receiver;
        struct drbd_thread worker;
-       struct drbd_thread asender;
+       struct drbd_thread ack_receiver;
+       struct workqueue_struct *ack_sender;
 
        /* cached pointers,
         * so we can look up the oldest pending requests more quickly.
@@ -774,6 +762,8 @@ struct drbd_connection {
        struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
 
        struct {
+               unsigned long last_sent_barrier_jif;
+
                /* whether this sender thread
                 * has processed a single write yet. */
                bool seen_any_write_yet;
@@ -788,6 +778,17 @@ struct drbd_connection {
        } send;
 };
 
+static inline bool has_net_conf(struct drbd_connection *connection)
+{
+       bool has_net_conf;
+
+       rcu_read_lock();
+       has_net_conf = rcu_dereference(connection->net_conf);
+       rcu_read_unlock();
+
+       return has_net_conf;
+}
+
 void __update_timing_details(
                struct drbd_thread_timing_details *tdp,
                unsigned int *cb_nr,
@@ -811,6 +812,7 @@ struct drbd_peer_device {
        struct list_head peer_devices;
        struct drbd_device *device;
        struct drbd_connection *connection;
+       struct work_struct send_acks_work;
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debugfs_peer_dev;
 #endif
@@ -829,6 +831,7 @@ struct drbd_device {
        struct dentry *debugfs_vol_act_log_extents;
        struct dentry *debugfs_vol_resync_extents;
        struct dentry *debugfs_vol_data_gen_id;
+       struct dentry *debugfs_vol_ed_gen_id;
 #endif
 
        unsigned int vnr;       /* volume number within the connection */
@@ -873,6 +876,7 @@ struct drbd_device {
        atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
        atomic_t unacked_cnt;    /* Need to send replies for */
        atomic_t local_cnt;      /* Waiting for local completion */
+       atomic_t suspend_cnt;
 
        /* Interval tree of pending local requests */
        struct rb_root read_requests;
@@ -1020,6 +1024,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev
        return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
 }
 
+static inline struct drbd_peer_device *
+conn_peer_device(struct drbd_connection *connection, int volume_number)
+{
+       return idr_find(&connection->peer_devices, volume_number);
+}
+
 #define for_each_resource(resource, _resources) \
        list_for_each_entry(resource, _resources, resources)
 
@@ -1113,7 +1123,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
 extern int drbd_send_bitmap(struct drbd_device *device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
+extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
 extern void drbd_device_cleanup(struct drbd_device *device);
 void drbd_print_uuids(struct drbd_device *device, const char *text);
 
@@ -1424,7 +1434,7 @@ extern struct bio_set *drbd_md_io_bio_set;
 /* to allocate from that set */
 extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 
-extern rwlock_t global_state_lock;
+extern struct mutex resources_mutex;
 
 extern int conn_lowest_minor(struct drbd_connection *connection);
 extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
@@ -1454,6 +1464,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
 
 
 /* drbd_nl.c */
+
+extern struct mutex notification_mutex;
+
 extern void drbd_suspend_io(struct drbd_device *device);
 extern void drbd_resume_io(struct drbd_device *device);
 extern char *ppsize(char *buf, unsigned long long size);
@@ -1536,7 +1549,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 
 /* drbd_receiver.c */
 extern int drbd_receiver(struct drbd_thread *thi);
-extern int drbd_asender(struct drbd_thread *thi);
+extern int drbd_ack_receiver(struct drbd_thread *thi);
+extern void drbd_send_ping_wf(struct work_struct *ws);
+extern void drbd_send_acks_wf(struct work_struct *ws);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
 extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
                bool throttle_if_app_is_waiting);
@@ -1649,7 +1664,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s
 #define drbd_rs_failed_io(device, sector, size) \
        __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
 extern void drbd_al_shrink(struct drbd_device *device);
-extern int drbd_initialize_al(struct drbd_device *, void *);
+extern int drbd_al_initialize(struct drbd_device *, void *);
 
 /* drbd_nl.c */
 /* state info broadcast */
@@ -1668,6 +1683,29 @@ struct sib_info {
 };
 void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
 
+extern void notify_resource_state(struct sk_buff *,
+                                 unsigned int,
+                                 struct drbd_resource *,
+                                 struct resource_info *,
+                                 enum drbd_notification_type);
+extern void notify_device_state(struct sk_buff *,
+                               unsigned int,
+                               struct drbd_device *,
+                               struct device_info *,
+                               enum drbd_notification_type);
+extern void notify_connection_state(struct sk_buff *,
+                                   unsigned int,
+                                   struct drbd_connection *,
+                                   struct connection_info *,
+                                   enum drbd_notification_type);
+extern void notify_peer_device_state(struct sk_buff *,
+                                    unsigned int,
+                                    struct drbd_peer_device *,
+                                    struct peer_device_info *,
+                                    enum drbd_notification_type);
+extern void notify_helper(enum drbd_notification_type, struct drbd_device *,
+                         struct drbd_connection *, const char *, int);
+
 /*
  * inline helper functions
  *************************/
@@ -1694,19 +1732,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r
        return 0;
 }
 
-static inline enum drbd_state_rv
-_drbd_set_state(struct drbd_device *device, union drbd_state ns,
-               enum chg_state_flags flags, struct completion *done)
-{
-       enum drbd_state_rv rv;
-
-       read_lock(&global_state_lock);
-       rv = __drbd_set_state(device, ns, flags, done);
-       read_unlock(&global_state_lock);
-
-       return rv;
-}
-
 static inline union drbd_state drbd_read_state(struct drbd_device *device)
 {
        struct drbd_resource *resource = device->resource;
@@ -1937,16 +1962,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit)
 
 extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
 
-static inline void wake_asender(struct drbd_connection *connection)
+/* To get the ack_receiver out of the blocking network stack,
+ * so it can change its sk_rcvtimeo from idle- to ping-timeout,
+ * and send a ping, we need to send a signal.
+ * Which signal we send is irrelevant. */
+static inline void wake_ack_receiver(struct drbd_connection *connection)
 {
-       if (test_bit(SIGNAL_ASENDER, &connection->flags))
-               force_sig(DRBD_SIG, connection->asender.task);
+       struct task_struct *task = connection->ack_receiver.task;
+       if (task && get_t_state(&connection->ack_receiver) == RUNNING)
+               force_sig(SIGXCPU, task);
 }
 
 static inline void request_ping(struct drbd_connection *connection)
 {
        set_bit(SEND_PING, &connection->flags);
-       wake_asender(connection);
+       wake_ack_receiver(connection);
 }
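The comment above is the whole mechanism: a thread parked in a blocking network receive returns with -EINTR when signalled, so any signal (here SIGXCPU, picked arbitrarily) bounces it back into its main loop, where it can adjust sk_rcvtimeo and send the ping. Schematically it might look like the following; receive_one_packet() and send_ping() are hypothetical stand-ins, not real drbd functions:

    /* Sketch of the kthread side of the wake_ack_receiver() handshake. */
    static int ack_receiver_loop_sketch(struct drbd_connection *connection)
    {
            while (get_t_state(&connection->ack_receiver) == RUNNING) {
                    int rv = receive_one_packet(connection); /* may block in recvmsg */

                    if (rv == -EINTR) {
                            flush_signals(current); /* kicked on purpose; not an error */
                            if (test_bit(SEND_PING, &connection->flags))
                                    send_ping(connection); /* hypothetical helper */
                            continue;
                    }
                    /* ... handle the packet or a real error ... */
            }
            return 0;
    }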
 
 extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
@@ -2230,7 +2260,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device)
 
        if (drbd_suspended(device))
                return false;
-       if (test_bit(SUSPEND_IO, &device->flags))
+       if (atomic_read(&device->suspend_cnt))
                return false;
 
        /* to avoid potential deadlock or bitmap corruption,
index 74d97f4bac3488ac767ef97b20dfb3e2277e50cf..5b43dfb798191d0c90cc3b5115956575e56410f9 100644 (file)
@@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0
  */
 struct idr drbd_devices;
 struct list_head drbd_resources;
+struct mutex resources_mutex;
 
 struct kmem_cache *drbd_request_cache;
 struct kmem_cache *drbd_ee_cache;      /* peer requests */
@@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str
        /* long elapsed = (long)(jiffies - device->last_received); */
 
        drop_it =   connection->meta.socket == sock
-               || !connection->asender.task
-               || get_t_state(&connection->asender) != RUNNING
+               || !connection->ack_receiver.task
+               || get_t_state(&connection->ack_receiver) != RUNNING
                || connection->cstate < C_WF_REPORT_PARAMS;
 
        if (drop_it)
@@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock,
                drbd_update_congested(connection);
        }
        do {
-               /* STRANGE
-                * tcp_sendmsg does _not_ use its size parameter at all ?
-                *
-                * -EAGAIN on timeout, -EINTR on signal.
-                */
-/* THINK
- * do we need to block DRBD_SIG if sock == &meta.socket ??
- * otherwise wake_asender() might interrupt some send_*Ack !
- */
                rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
                if (rv == -EAGAIN) {
                        if (we_should_drop_the_connection(connection, sock))
@@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device)
                drbd_bm_cleanup(device);
        }
 
-       drbd_free_ldev(device->ldev);
+       drbd_backing_dev_free(device, device->ldev);
        device->ldev = NULL;
 
        clear_bit(AL_SUSPENDED, &device->flags);
@@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref)
        if (device->this_bdev)
                bdput(device->this_bdev);
 
-       drbd_free_ldev(device->ldev);
+       drbd_backing_dev_free(device, device->ldev);
        device->ldev = NULL;
 
        drbd_release_all_peer_reqs(device);
@@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op
                cpumask_copy(resource->cpu_mask, new_cpu_mask);
                for_each_connection_rcu(connection, resource) {
                        connection->receiver.reset_cpu_mask = 1;
-                       connection->asender.reset_cpu_mask = 1;
+                       connection->ack_receiver.reset_cpu_mask = 1;
                        connection->worker.reset_cpu_mask = 1;
                }
        }
@@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
        kref_init(&resource->kref);
        idr_init(&resource->devices);
        INIT_LIST_HEAD(&resource->connections);
-       resource->write_ordering = WO_bdev_flush;
+       resource->write_ordering = WO_BDEV_FLUSH;
        list_add_tail_rcu(&resource->resources, &drbd_resources);
        mutex_init(&resource->conf_update);
        mutex_init(&resource->adm_mutex);
@@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
        connection->receiver.connection = connection;
        drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
        connection->worker.connection = connection;
-       drbd_thread_init(resource, &connection->asender, drbd_asender, "asender");
-       connection->asender.connection = connection;
+       drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
+       connection->ack_receiver.connection = connection;
 
        kref_init(&connection->kref);
 
@@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device)
 {
        /* opencoded create_singlethread_workqueue(),
         * to be able to say "drbd%d", ..., minor */
-       device->submit.wq = alloc_workqueue("drbd%u_submit",
-                       WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor);
+       device->submit.wq =
+               alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
        if (!device->submit.wq)
                return -ENOMEM;
 
@@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
                        goto out_idr_remove_from_resource;
                }
                kref_get(&connection->kref);
+               INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
        }
 
        if (init_submitter(device)) {
@@ -2923,7 +2916,7 @@ static int __init drbd_init(void)
        drbd_proc = NULL; /* play safe for drbd_cleanup */
        idr_init(&drbd_devices);
 
-       rwlock_init(&global_state_lock);
+       mutex_init(&resources_mutex);
        INIT_LIST_HEAD(&drbd_resources);
 
        err = drbd_genl_register();
@@ -2971,18 +2964,6 @@ fail:
        return err;
 }
 
-void drbd_free_ldev(struct drbd_backing_dev *ldev)
-{
-       if (ldev == NULL)
-               return;
-
-       blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
-       blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
-
-       kfree(ldev->disk_conf);
-       kfree(ldev);
-}
-
 static void drbd_free_one_sock(struct drbd_socket *ds)
 {
        struct socket *s;
@@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
         * and read it. */
        bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
        bdev->md.md_offset = drbd_md_ss(bdev);
+       /* Even for (flexible or indexed) external meta data,
+        * initially restrict ourselves to the 4k superblock for now.
+        * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
+       bdev->md.md_size_sect = 8;
 
        if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
                /* NOTE: can't do normal error processing here as this is
@@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
 
        spin_lock_irq(&device->resource->req_lock);
        set_bit(BITMAP_IO, &device->flags);
-       if (atomic_read(&device->ap_bio_cnt) == 0) {
+       /* don't wait for pending application IO if the caller indicates that
+        * application IO does not conflict anyway. */
+       if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
                if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
                        drbd_queue_work(&first_peer_device(device)->connection->sender_work,
                                        &device->bm_io_work.w);
@@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
        return 0;
 }
 
+void lock_all_resources(void)
+{
+       struct drbd_resource *resource;
+       int __maybe_unused i = 0;
+
+       mutex_lock(&resources_mutex);
+       local_irq_disable();
+       for_each_resource(resource, &drbd_resources)
+               spin_lock_nested(&resource->req_lock, i++);
+}
+
+void unlock_all_resources(void)
+{
+       struct drbd_resource *resource;
+
+       for_each_resource(resource, &drbd_resources)
+               spin_unlock(&resource->req_lock);
+       local_irq_enable();
+       mutex_unlock(&resources_mutex);
+}
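spin_lock_nested() exists for lockdep: acquiring several locks of the same lock class would normally be flagged as a potential deadlock, so each acquisition is annotated with an increasing subclass; without lockdep the subclass argument compiles away, hence the __maybe_unused above. The idiom in isolation, with a made-up struct widget:

    /* Lockdep-friendly acquisition of N same-class locks under one mutex.
     * Hypothetical type: struct widget { spinlock_t lock; struct list_head list; }; */
    static void lock_all_widgets(struct list_head *widgets)
    {
            struct widget *w;
            int subclass = 0;

            list_for_each_entry(w, widgets, list)
                    spin_lock_nested(&w->lock, subclass++);
    }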
+
 #ifdef CONFIG_DRBD_FAULT_INJECTION
 /* Fault insertion support including random number generator shamelessly
  * stolen from kernel/rcutorture.c */
index e80cbefbc2b548761fe5f531773b1022a21353eb..c055c5e12f248dc77b60c89d4b723f0de3f934b3 100644 (file)
@@ -36,6 +36,7 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
+#include "drbd_state_change.h"
 #include <asm/unaligned.h>
 #include <linux/drbd_limits.h>
 #include <linux/kthread.h>
@@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
 /* .dumpit */
 int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_devices_done(struct netlink_callback *cb);
+int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_connections_done(struct netlink_callback *cb);
+int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb);
+int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
 
 #include <linux/drbd_genl_api.h>
 #include "drbd_nla.h"
 #include <linux/genl_magic_func.h>
 
+static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */
+
+DEFINE_MUTEX(notification_mutex);
+
 /* used blkdev_get_by_path, to claim our meta data device(s) */
 static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
 
@@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
        sib.sib_reason = SIB_HELPER_PRE;
        sib.helper_name = cmd;
        drbd_bcast_event(device, &sib);
+       notify_helper(NOTIFY_CALL, device, connection, cmd, 0);
        ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
        if (ret)
                drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
@@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
        sib.sib_reason = SIB_HELPER_POST;
        sib.helper_exit_code = ret;
        drbd_bcast_event(device, &sib);
+       notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret);
 
        if (current == connection->worker.task)
                clear_bit(CALLBACK_PENDING, &connection->flags);
@@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
 
        drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
        /* TODO: conn_bcast_event() ?? */
+       notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0);
 
        ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
        if (ret)
@@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd)
                          usermode_helper, cmd, resource_name,
                          (ret >> 8) & 0xff, ret);
        /* TODO: conn_bcast_event() ?? */
+       notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret);
 
        if (ret < 0) /* Ignore any ERRNOs we got. */
                ret = 0;
@@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size)
  * and can be long lived.
 * This changes a device->flag, is triggered by drbd internals,
  * and should be short-lived. */
+/* It needs to be a counter, since multiple threads might
+   independently suspend and resume IO. */
 void drbd_suspend_io(struct drbd_device *device)
 {
-       set_bit(SUSPEND_IO, &device->flags);
+       atomic_inc(&device->suspend_cnt);
        if (drbd_suspended(device))
                return;
        wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
@@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device)
 
 void drbd_resume_io(struct drbd_device *device)
 {
-       clear_bit(SUSPEND_IO, &device->flags);
-       wake_up(&device->misc_wait);
+       if (atomic_dec_and_test(&device->suspend_cnt))
+               wake_up(&device->misc_wait);
 }
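With a counter, suspend/resume pairs nest: IO stays blocked until the last suspender resumes, whereas with the old SUSPEND_IO bit the first resume would already have cleared it. The semantics in miniature, as a plain userspace demo without the kernel atomics:

    #include <assert.h>
    #include <stdio.h>

    static int suspend_cnt; /* plays the role of device->suspend_cnt */

    static void suspend_io(void) { suspend_cnt++; }
    static int resume_io(void) { return --suspend_cnt == 0; } /* 1: wake waiters */

    int main(void)
    {
            suspend_io();         /* e.g. a resize in progress */
            suspend_io();         /* e.g. a concurrent detach */
            assert(!resume_io()); /* inner resume: IO still suspended */
            assert(resume_io());  /* outer resume: now wake misc_wait */
            printf("ok\n");
            return 0;
    }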
 
 /**
@@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device)
 enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
 {
-       sector_t prev_first_sect, prev_size; /* previous meta location */
-       sector_t la_size_sect, u_size;
+       struct md_offsets_and_sizes {
+               u64 last_agreed_sect;
+               u64 md_offset;
+               s32 al_offset;
+               s32 bm_offset;
+               u32 md_size_sect;
+
+               u32 al_stripes;
+               u32 al_stripe_size_4k;
+       } prev;
+       sector_t u_size, size;
        struct drbd_md *md = &device->ldev->md;
-       u32 prev_al_stripe_size_4k;
-       u32 prev_al_stripes;
-       sector_t size;
        char ppb[10];
        void *buffer;
 
        int md_moved, la_size_changed;
        enum determine_dev_size rv = DS_UNCHANGED;
 
-       /* race:
-        * application request passes inc_ap_bio,
-        * but then cannot get an AL-reference.
-        * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+       /* We may change the on-disk offsets of our meta data below.  Lock out
+        * anything that may cause meta data IO, to avoid acting on incomplete
+        * layout changes or scribbling over meta data that is in the process
+        * of being moved.
         *
-        * to avoid that:
-        * Suspend IO right here.
-        * still lock the act_log to not trigger ASSERTs there.
-        */
+        * "Move" is not exactly correct, btw: currently we keep all our meta
+        * data in core memory; to "move" it we just write it all out, there
+        * are no reads. */
        drbd_suspend_io(device);
        buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
        if (!buffer) {
@@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
                return DS_ERROR;
        }
 
-       /* no wait necessary anymore, actually we could assert that */
-       wait_event(device->al_wait, lc_try_lock(device->act_log));
-
-       prev_first_sect = drbd_md_first_sector(device->ldev);
-       prev_size = device->ldev->md.md_size_sect;
-       la_size_sect = device->ldev->md.la_size_sect;
+       /* remember current offset and sizes */
+       prev.last_agreed_sect = md->la_size_sect;
+       prev.md_offset = md->md_offset;
+       prev.al_offset = md->al_offset;
+       prev.bm_offset = md->bm_offset;
+       prev.md_size_sect = md->md_size_sect;
+       prev.al_stripes = md->al_stripes;
+       prev.al_stripe_size_4k = md->al_stripe_size_4k;
 
        if (rs) {
                /* rs is non NULL if we should change the AL layout only */
-
-               prev_al_stripes = md->al_stripes;
-               prev_al_stripe_size_4k = md->al_stripe_size_4k;
-
                md->al_stripes = rs->al_stripes;
                md->al_stripe_size_4k = rs->al_stripe_size / 4;
                md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
@@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
        rcu_read_unlock();
        size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
 
-       if (size < la_size_sect) {
+       if (size < prev.last_agreed_sect) {
                if (rs && u_size == 0) {
                        /* Remove "rs &&" later. This check should always be active, but
                           right now the receiver expects the permissive behavior */
@@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
                err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
                if (unlikely(err)) {
                        /* currently there is only one error: ENOMEM! */
-                       size = drbd_bm_capacity(device)>>1;
+                       size = drbd_bm_capacity(device);
                        if (size == 0) {
                                drbd_err(device, "OUT OF MEMORY! "
                                    "Could not allocate bitmap!\n");
                        } else {
                                drbd_err(device, "BM resizing failed. "
-                                   "Leaving size unchanged at size = %lu KB\n",
-                                   (unsigned long)size);
+                                   "Leaving size unchanged\n");
                        }
                        rv = DS_ERROR;
                }
                /* racy, see comments above. */
                drbd_set_my_capacity(device, size);
-               device->ldev->md.la_size_sect = size;
+               md->la_size_sect = size;
                drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
                     (unsigned long long)size>>1);
        }
        if (rv <= DS_ERROR)
                goto err_out;
 
-       la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
+       la_size_changed = (prev.last_agreed_sect != md->la_size_sect);
 
-       md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
-               || prev_size       != device->ldev->md.md_size_sect;
+       md_moved = prev.md_offset    != md->md_offset
+               || prev.md_size_sect != md->md_size_sect;
 
        if (la_size_changed || md_moved || rs) {
                u32 prev_flags;
@@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
                 * Clear the timer, to avoid scary "timer expired!" messages,
                 * "Superblock" is written out at least twice below, anyways. */
                del_timer(&device->md_sync_timer);
-               drbd_al_shrink(device); /* All extents inactive. */
 
+               /* We won't change the "al-extents" setting, we just may need
+                * to move the on-disk location of the activity log ringbuffer.
+                * Locking it for a transaction is good enough; it may well be
+                * "dirty" or even "starving". */
+               wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
+
+               /* mark current on-disk bitmap and activity log as unreliable */
                prev_flags = md->flags;
-               md->flags &= ~MDF_PRIMARY_IND;
+               md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
                drbd_md_write(device, buffer);
 
+               drbd_al_initialize(device, buffer);
+
                drbd_info(device, "Writing the whole bitmap, %s\n",
                         la_size_changed && md_moved ? "size changed and md moved" :
                         la_size_changed ? "size changed" : "md moved");
                /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
                drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
                               "size changed", BM_LOCKED_MASK);
-               drbd_initialize_al(device, buffer);
 
+       /* on-disk bitmap and activity log are authoritative again
+                * (unless there was an IO error meanwhile...) */
                md->flags = prev_flags;
                drbd_md_write(device, buffer);
 
@@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
                                  md->al_stripes, md->al_stripe_size_4k * 4);
        }
 
-       if (size > la_size_sect)
-               rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
-       if (size < la_size_sect)
+       if (size > prev.last_agreed_sect)
+               rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO;
+       if (size < prev.last_agreed_sect)
                rv = DS_SHRUNK;
 
        if (0) {
        err_out:
-               if (rs) {
-                       md->al_stripes = prev_al_stripes;
-                       md->al_stripe_size_4k = prev_al_stripe_size_4k;
-                       md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
-
-                       drbd_md_set_sector_offsets(device, device->ldev);
-               }
+               /* restore previous offset and sizes */
+               md->la_size_sect = prev.last_agreed_sect;
+               md->md_offset = prev.md_offset;
+               md->al_offset = prev.al_offset;
+               md->bm_offset = prev.bm_offset;
+               md->md_size_sect = prev.md_size_sect;
+               md->al_stripes = prev.al_stripes;
+               md->al_stripe_size_4k = prev.al_stripe_size_4k;
+               md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k;
        }
        lc_unlock(device->act_log);
        wake_up(&device->al_wait);
@@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
                lc_destroy(n);
                return -EBUSY;
        } else {
-               if (t)
-                       lc_destroy(t);
+               lc_destroy(t);
        }
        drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elements */
        return 0;
@@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
        if (b) {
                struct drbd_connection *connection = first_peer_device(device)->connection;
 
+               blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
+
                if (blk_queue_discard(b) &&
                    (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
-                       /* For now, don't allow more than one activity log extent worth of data
-                        * to be discarded in one go. We may need to rework drbd_al_begin_io()
-                        * to allow for even larger discard ranges */
-                       blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
-
+                       /* We don't care; stacking below should fix it for the local device.
+                        * Whether or not it is a suitable granularity on the remote device
+                        * is not our problem, really. If you care, you need to
+                        * use devices with similar topology on all peers. */
+                       q->limits.discard_granularity = 512;
                        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
-                       /* REALLY? Is stacking secdiscard "legal"? */
-                       if (blk_queue_secdiscard(b))
-                               queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
                } else {
                        blk_queue_max_discard_sectors(q, 0);
                        queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
-                       queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
+                       q->limits.discard_granularity = 0;
                }
 
                blk_queue_stack_limits(q, b);
@@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
                        q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
                }
        }
+       /* To avoid confusion, if this queue does not support discard, clear
+        * max_discard_sectors, which is what lsblk -D reports to the user.  */
+       if (!blk_queue_discard(q)) {
+               blk_queue_max_discard_sectors(q, 0);
+               q->limits.discard_granularity = 0;
+       }
 }
 
 void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
@@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection)
                connection->cstate == C_STANDALONE;
        spin_unlock_irq(&connection->resource->req_lock);
        if (stop_threads) {
-               /* asender is implicitly stopped by receiver
-                * in conn_disconnect() */
+               /* ack_receiver thread and ack_sender workqueue are implicitly
+                * stopped by receiver in conn_disconnect() */
                drbd_thread_stop(&connection->receiver);
                drbd_thread_stop(&connection->worker);
        }
@@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
                goto fail_unlock;
        }
 
-       write_lock_irq(&global_state_lock);
+       lock_all_resources();
        retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
        if (retcode == NO_ERROR) {
                rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
                drbd_resync_after_changed(device);
        }
-       write_unlock_irq(&global_state_lock);
+       unlock_all_resources();
 
        if (retcode != NO_ERROR)
                goto fail_unlock;
@@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
                set_bit(MD_NO_FUA, &device->flags);
 
        if (write_ordering_changed(old_disk_conf, new_disk_conf))
-               drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
+               drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
 
        drbd_md_sync(device);
 
@@ -1449,6 +1486,88 @@ success:
        return 0;
 }
 
+static struct block_device *open_backing_dev(struct drbd_device *device,
+               const char *bdev_path, void *claim_ptr, bool do_bd_link)
+{
+       struct block_device *bdev;
+       int err = 0;
+
+       bdev = blkdev_get_by_path(bdev_path,
+                                 FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr);
+       if (IS_ERR(bdev)) {
+               drbd_err(device, "open(\"%s\") failed with %ld\n",
+                               bdev_path, PTR_ERR(bdev));
+               return bdev;
+       }
+
+       if (!do_bd_link)
+               return bdev;
+
+       err = bd_link_disk_holder(bdev, device->vdisk);
+       if (err) {
+               blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+               drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n",
+                               bdev_path, err);
+               bdev = ERR_PTR(err);
+       }
+       return bdev;
+}
+
+static int open_backing_devices(struct drbd_device *device,
+               struct disk_conf *new_disk_conf,
+               struct drbd_backing_dev *nbc)
+{
+       struct block_device *bdev;
+
+       bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true);
+       if (IS_ERR(bdev))
+               return ERR_OPEN_DISK;
+       nbc->backing_bdev = bdev;
+
+       /*
+        * meta_dev_idx >= 0: external fixed size, possibly multiple
+        * drbd sharing one meta device.  TODO in that case, paranoia
+        * check that [md_bdev, meta_dev_idx] is not yet used by some
+        * other drbd minor!  (if you use drbd.conf + drbdadm, that
+        * should check it for you already; but if you don't, or
+        * someone fooled it, we need to double check here)
+        */
+       bdev = open_backing_dev(device, new_disk_conf->meta_dev,
+               /* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
+                * if potentially shared with other drbd minors */
+                       (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder,
+               /* avoid double bd_claim_by_disk() for the same (source,target) tuple,
+                * as would happen with internal metadata. */
+                       (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
+                        new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL));
+       if (IS_ERR(bdev))
+               return ERR_OPEN_MD_DISK;
+       nbc->md_bdev = bdev;
+       return NO_ERROR;
+}
+
+static void close_backing_dev(struct drbd_device *device, struct block_device *bdev,
+       bool do_bd_unlink)
+{
+       if (!bdev)
+               return;
+       if (do_bd_unlink)
+               bd_unlink_disk_holder(bdev, device->vdisk);
+       blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+}
+
+void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev)
+{
+       if (ldev == NULL)
+               return;
+
+       close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev);
+       close_backing_dev(device, ldev->backing_bdev, true);
+
+       kfree(ldev->disk_conf);
+       kfree(ldev);
+}
+
 int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 {
        struct drbd_config_context adm_ctx;
@@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        sector_t min_md_device_sectors;
        struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
        struct disk_conf *new_disk_conf = NULL;
-       struct block_device *bdev;
        struct lru_cache *resync_lru = NULL;
        struct fifo_buffer *new_plan = NULL;
        union drbd_state ns, os;
@@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        device = adm_ctx.device;
        mutex_lock(&adm_ctx.resource->adm_mutex);
        peer_device = first_peer_device(device);
-       connection = peer_device ? peer_device->connection : NULL;
+       connection = peer_device->connection;
        conn_reconfig_start(connection);
 
        /* if you want to reconfigure, please tear down first */
@@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto fail;
        }
 
-       write_lock_irq(&global_state_lock);
-       retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
-       write_unlock_irq(&global_state_lock);
-       if (retcode != NO_ERROR)
-               goto fail;
-
        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);
        if (nc) {
@@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        }
        rcu_read_unlock();
 
-       bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
-                                 FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
-       if (IS_ERR(bdev)) {
-               drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
-                       PTR_ERR(bdev));
-               retcode = ERR_OPEN_DISK;
-               goto fail;
-       }
-       nbc->backing_bdev = bdev;
-
-       /*
-        * meta_dev_idx >= 0: external fixed size, possibly multiple
-        * drbd sharing one meta device.  TODO in that case, paranoia
-        * check that [md_bdev, meta_dev_idx] is not yet used by some
-        * other drbd minor!  (if you use drbd.conf + drbdadm, that
-        * should check it for you already; but if you don't, or
-        * someone fooled it, we need to double check here)
-        */
-       bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
-                                 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-                                 (new_disk_conf->meta_dev_idx < 0) ?
-                                 (void *)device : (void *)drbd_m_holder);
-       if (IS_ERR(bdev)) {
-               drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
-                       PTR_ERR(bdev));
-               retcode = ERR_OPEN_MD_DISK;
+       retcode = open_backing_devices(device, new_disk_conf, nbc);
+       if (retcode != NO_ERROR)
                goto fail;
-       }
-       nbc->md_bdev = bdev;
 
        if ((nbc->backing_bdev == nbc->md_bdev) !=
            (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
@@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto force_diskless_dec;
        }
 
+       lock_all_resources();
+       retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+       if (retcode != NO_ERROR) {
+               unlock_all_resources();
+               goto force_diskless_dec;
+       }
+
        /* Reset the "barriers don't work" bits here, then force meta data to
         * be written, to ensure we determine if barriers are supported. */
        if (new_disk_conf->md_flushes)
@@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        new_disk_conf = NULL;
        new_plan = NULL;
 
-       drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
+       drbd_resync_after_changed(device);
+       drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
+       unlock_all_resources();
 
        if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
                set_bit(CRASHED_PRIMARY, &device->flags);
@@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
  fail:
        conn_reconfig_done(connection);
        if (nbc) {
-               if (nbc->backing_bdev)
-                       blkdev_put(nbc->backing_bdev,
-                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL);
-               if (nbc->md_bdev)
-                       blkdev_put(nbc->md_bdev,
-                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+               close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev);
+               close_backing_dev(device, nbc->backing_bdev, true);
                kfree(nbc);
        }
        kfree(new_disk_conf);
@@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 static int adm_detach(struct drbd_device *device, int force)
 {
        enum drbd_state_rv retcode;
+       void *buffer;
        int ret;
 
        if (force) {
@@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force)
        }
 
        drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-       drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
-       retcode = drbd_request_state(device, NS(disk, D_FAILED));
-       drbd_md_put_buffer(device);
+       buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
+       if (buffer) {
+               retcode = drbd_request_state(device, NS(disk, D_FAILED));
+               drbd_md_put_buffer(device);
+       } else /* already <= D_FAILED */
+               retcode = SS_NOTHING_TO_DO;
        /* D_FAILED will transition to DISKLESS. */
+       drbd_resume_io(device);
        ret = wait_event_interruptible(device->misc_wait,
                        device->state.disk != D_FAILED);
-       drbd_resume_io(device);
        if ((int)retcode == (int)SS_IS_DISKLESS)
                retcode = SS_NOTHING_TO_DO;
        if (ret)
@@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
        return 0;
 }
 
+static void connection_to_info(struct connection_info *info,
+                              struct drbd_connection *connection)
+{
+       info->conn_connection_state = connection->cstate;
+       info->conn_role = conn_highest_peer(connection);
+}
+
+static void peer_device_to_info(struct peer_device_info *info,
+                               struct drbd_peer_device *peer_device)
+{
+       struct drbd_device *device = peer_device->device;
+
+       info->peer_repl_state =
+               max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn);
+       info->peer_disk_state = device->state.pdsk;
+       info->peer_resync_susp_user = device->state.user_isp;
+       info->peer_resync_susp_peer = device->state.peer_isp;
+       info->peer_resync_susp_dependency = device->state.aftr_isp;
+}
+
 int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 {
+       struct connection_info connection_info;
+       enum drbd_notification_type flags;
+       unsigned int peer_devices = 0;
        struct drbd_config_context adm_ctx;
        struct drbd_peer_device *peer_device;
        struct net_conf *old_net_conf, *new_net_conf = NULL;
@@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
        connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
        memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
 
+       idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+               peer_devices++;
+       }
+
+       connection_to_info(&connection_info, connection);
+       flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+       mutex_lock(&notification_mutex);
+       notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags);
+       idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+               struct peer_device_info peer_device_info;
+
+               peer_device_to_info(&peer_device_info, peer_device);
+               flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+               notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags);
+       }
+       mutex_unlock(&notification_mutex);
        mutex_unlock(&adm_ctx.resource->conf_update);
 
        rcu_read_lock();
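The NOTIFY_CREATE burst above relies on a small counting idiom: the total number of events in the group is known up front, and "flags = (remaining--) ? NOTIFY_CONTINUES : 0" marks every event except the last, so a listener can tell when the snapshot is complete. A minimal standalone sketch of the idiom (F_CONTINUES and emit() are illustrative names, not DRBD's):

	#include <stdio.h>

	#define F_CONTINUES 0x1			/* stands in for NOTIFY_CONTINUES */

	static void emit(int idx, unsigned int flags)
	{
		printf("event %d%s\n", idx,
		       (flags & F_CONTINUES) ? " (continues)" : "");
	}

	int main(void)
	{
		int remaining = 3;	/* events that will follow the current one */
		unsigned int flags;

		for (int i = 0; i < 4; i++) {
			/* post-decrement: non-zero means more events follow */
			flags = remaining-- ? F_CONTINUES : 0;
			emit(i, flags);
		}
		return 0;	/* the last event was emitted with flags == 0 */
	}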
@@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection
                        drbd_err(connection,
                                "unexpected rv2=%d in conn_try_disconnect()\n",
                                rv2);
+               /* Unlike in DRBD 9, the state engine has generated
+                * NOTIFY_DESTROY events before clearing connection->net_conf. */
        }
        return rv;
 }
@@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
                mutex_unlock(&device->resource->conf_update);
                synchronize_rcu();
                kfree(old_disk_conf);
+               new_disk_conf = NULL;
        }
 
        ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
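The two kfree()-related hunks in drbd_adm_resize implement a common ownership-transfer guard: once the new configuration has been published, the local pointer is cleared so the shared error path can free it unconditionally without a double free. A hedged sketch of the pattern (struct dev, struct conf, and later_step() are generic placeholders, not the DRBD types):

	static int reconfigure(struct dev *dev)
	{
		struct conf *conf;
		int err;

		conf = kmalloc(sizeof(*conf), GFP_KERNEL);
		if (!conf)
			return -ENOMEM;
		/* ... fill in *conf ... */
		rcu_assign_pointer(dev->conf, conf);
		conf = NULL;		/* ownership transferred to dev */

		err = later_step(dev);	/* may still fail */
		if (err)
			goto fail;
		return 0;
	fail:
		kfree(conf);		/* kfree(NULL) is a no-op, so always safe */
		return err;
	}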
@@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 
  fail_ldev:
        put_ldev(device);
+       kfree(new_disk_conf);
        goto fail;
 }
 
@@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
        mutex_lock(&adm_ctx.resource->adm_mutex);
        device = adm_ctx.device;
        if (test_bit(NEW_CUR_UUID, &device->flags)) {
-               drbd_uuid_new_current(device);
+               if (get_ldev_if_state(device, D_ATTACHING)) {
+                       drbd_uuid_new_current(device);
+                       put_ldev(device);
+               } else {
+                       /* This is effectively a multi-stage "forced down".
+                        * The NEW_CUR_UUID bit is supposedly only set if we
+                        * lost the replication connection, and are configured
+                        * to freeze IO and wait for some fence-peer handler.
+                        * So we still don't have a replication connection.
+                        * And now we don't have a local disk either.  After
+                        * resume, we will fail all pending and new IO, because
+                        * we don't have any data anymore.  Which means we will
+                        * eventually be able to terminate all users of this
+                        * device, and then take it down.  By bumping the
+                        * "effective" data uuid, we make sure that you really
+                        * need to tear down before you reconfigure, we will
+                        * the refuse to re-connect or re-attach (because no
+                        * matching real data uuid exists).
+                        */
+                       u64 val;
+                       get_random_bytes(&val, sizeof(u64));
+                       drbd_set_ed_uuid(device, val);
+                       drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n");
+               }
                clear_bit(NEW_CUR_UUID, &device->flags);
        }
        drbd_suspend_io(device);
@@ -2909,6 +3070,486 @@ nla_put_failure:
        return -EMSGSIZE;
 }
 
+/*
+ * The generic netlink dump callbacks are called outside the genl_lock(), so
+ * they cannot use the simple attribute parsing code which uses global
+ * attribute tables.
+ */
+static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr)
+{
+       const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+       const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+       struct nlattr *nla;
+
+       nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen),
+                      DRBD_NLA_CFG_CONTEXT);
+       if (!nla)
+               return NULL;
+       return drbd_nla_find_nested(maxtype, nla, __nla_type(attr));
+}
+
+static void resource_to_info(struct resource_info *, struct drbd_resource *);
+
+int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb)
+{
+       struct drbd_genlmsghdr *dh;
+       struct drbd_resource *resource;
+       struct resource_info resource_info;
+       struct resource_statistics resource_statistics;
+       int err;
+
+       rcu_read_lock();
+       if (cb->args[0]) {
+               for_each_resource_rcu(resource, &drbd_resources)
+                       if (resource == (struct drbd_resource *)cb->args[0])
+                               goto found_resource;
+               err = 0;  /* resource was probably deleted */
+               goto out;
+       }
+       resource = list_entry(&drbd_resources,
+                             struct drbd_resource, resources);
+
+found_resource:
+       list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) {
+               goto put_result;
+       }
+       err = 0;
+       goto out;
+
+put_result:
+       dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+                       cb->nlh->nlmsg_seq, &drbd_genl_family,
+                       NLM_F_MULTI, DRBD_ADM_GET_RESOURCES);
+       err = -ENOMEM;
+       if (!dh)
+               goto out;
+       dh->minor = -1U;
+       dh->ret_code = NO_ERROR;
+       err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL);
+       if (err)
+               goto out;
+       err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN));
+       if (err)
+               goto out;
+       resource_to_info(&resource_info, resource);
+       err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN));
+       if (err)
+               goto out;
+       resource_statistics.res_stat_write_ordering = resource->write_ordering;
+       err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
+       if (err)
+               goto out;
+       cb->args[0] = (long)resource;
+       genlmsg_end(skb, dh);
+       err = 0;
+
+out:
+       rcu_read_unlock();
+       if (err)
+               return err;
+       return skb->len;
+}
+
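All of the dump callbacks that follow share the same resumption scheme: netlink invokes the callback repeatedly, each call emits at most one NLM_F_MULTI record, progress is stashed in cb->args[], and the paired *_done() callback drops whatever reference args[0] still pins. A condensed sketch of the control flow under assumed types (the real code stores object pointers rather than indices and revalidates them under RCU, since the object may disappear between calls):

	struct cb { long args[6]; };

	/* Called repeatedly; returns 0 when the dump is complete,
	 * non-zero while there is more data to send. */
	static int dump_items(struct cb *cb, int nitems)
	{
		long next = cb->args[0];	/* cursor: next item to report */

		if (next >= nitems)
			return 0;		/* done: netlink stops calling us */
		/* ... format item 'next' into the reply skb here ... */
		cb->args[0] = next + 1;		/* resume point for the next call */
		return 1;			/* "buffer used": call us again */
	}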
+static void device_to_statistics(struct device_statistics *s,
+                                struct drbd_device *device)
+{
+       memset(s, 0, sizeof(*s));
+       s->dev_upper_blocked = !may_inc_ap_bio(device);
+       if (get_ldev(device)) {
+               struct drbd_md *md = &device->ldev->md;
+               u64 *history_uuids = (u64 *)s->history_uuids;
+               struct request_queue *q;
+               int n;
+
+               spin_lock_irq(&md->uuid_lock);
+               s->dev_current_uuid = md->uuid[UI_CURRENT];
+               BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
+               for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
+                       history_uuids[n] = md->uuid[UI_HISTORY_START + n];
+               for (; n < HISTORY_UUIDS; n++)
+                       history_uuids[n] = 0;
+               s->history_uuids_len = HISTORY_UUIDS;
+               spin_unlock_irq(&md->uuid_lock);
+
+               s->dev_disk_flags = md->flags;
+               q = bdev_get_queue(device->ldev->backing_bdev);
+               s->dev_lower_blocked =
+                       bdi_congested(&q->backing_dev_info,
+                                     (1 << WB_async_congested) |
+                                     (1 << WB_sync_congested));
+               put_ldev(device);
+       }
+       s->dev_size = drbd_get_capacity(device->this_bdev);
+       s->dev_read = device->read_cnt;
+       s->dev_write = device->writ_cnt;
+       s->dev_al_writes = device->al_writ_cnt;
+       s->dev_bm_writes = device->bm_writ_cnt;
+       s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
+       s->dev_lower_pending = atomic_read(&device->local_cnt);
+       s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
+       s->dev_exposed_data_uuid = device->ed_uuid;
+}
+
+static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr)
+{
+       if (cb->args[0]) {
+               struct drbd_resource *resource =
+                       (struct drbd_resource *)cb->args[0];
+               kref_put(&resource->kref, drbd_destroy_resource);
+       }
+
+       return 0;
+}
+
+int drbd_adm_dump_devices_done(struct netlink_callback *cb)
+{
+       return put_resource_in_arg0(cb, 7);
+}
+
+static void device_to_info(struct device_info *, struct drbd_device *);
+
+int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb)
+{
+       struct nlattr *resource_filter;
+       struct drbd_resource *resource;
+       struct drbd_device *uninitialized_var(device);
+       int minor, err, retcode;
+       struct drbd_genlmsghdr *dh;
+       struct device_info device_info;
+       struct device_statistics device_statistics;
+       struct idr *idr_to_search;
+
+       resource = (struct drbd_resource *)cb->args[0];
+       if (!cb->args[0] && !cb->args[1]) {
+               resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+               if (resource_filter) {
+                       retcode = ERR_RES_NOT_KNOWN;
+                       resource = drbd_find_resource(nla_data(resource_filter));
+                       if (!resource)
+                               goto put_result;
+                       cb->args[0] = (long)resource;
+               }
+       }
+
+       rcu_read_lock();
+       minor = cb->args[1];
+       idr_to_search = resource ? &resource->devices : &drbd_devices;
+       device = idr_get_next(idr_to_search, &minor);
+       if (!device) {
+               err = 0;
+               goto out;
+       }
+       idr_for_each_entry_continue(idr_to_search, device, minor) {
+               retcode = NO_ERROR;
+               goto put_result;  /* only one iteration */
+       }
+       err = 0;
+       goto out;  /* no more devices */
+
+put_result:
+       dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+                       cb->nlh->nlmsg_seq, &drbd_genl_family,
+                       NLM_F_MULTI, DRBD_ADM_GET_DEVICES);
+       err = -ENOMEM;
+       if (!dh)
+               goto out;
+       dh->ret_code = retcode;
+       dh->minor = -1U;
+       if (retcode == NO_ERROR) {
+               dh->minor = device->minor;
+               err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device);
+               if (err)
+                       goto out;
+               if (get_ldev(device)) {
+                       struct disk_conf *disk_conf =
+                               rcu_dereference(device->ldev->disk_conf);
+
+                       err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN));
+                       put_ldev(device);
+                       if (err)
+                               goto out;
+               }
+               device_to_info(&device_info, device);
+               err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN));
+               if (err)
+                       goto out;
+
+               device_to_statistics(&device_statistics, device);
+               err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
+               if (err)
+                       goto out;
+               cb->args[1] = minor + 1;
+       }
+       genlmsg_end(skb, dh);
+       err = 0;
+
+out:
+       rcu_read_unlock();
+       if (err)
+               return err;
+       return skb->len;
+}
+
+int drbd_adm_dump_connections_done(struct netlink_callback *cb)
+{
+       return put_resource_in_arg0(cb, 6);
+}
+
+enum { SINGLE_RESOURCE, ITERATE_RESOURCES };
+
+int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb)
+{
+       struct nlattr *resource_filter;
+       struct drbd_resource *resource = NULL, *next_resource;
+       struct drbd_connection *uninitialized_var(connection);
+       int err = 0, retcode;
+       struct drbd_genlmsghdr *dh;
+       struct connection_info connection_info;
+       struct connection_statistics connection_statistics;
+
+       rcu_read_lock();
+       resource = (struct drbd_resource *)cb->args[0];
+       if (!cb->args[0]) {
+               resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+               if (resource_filter) {
+                       retcode = ERR_RES_NOT_KNOWN;
+                       resource = drbd_find_resource(nla_data(resource_filter));
+                       if (!resource)
+                               goto put_result;
+                       cb->args[0] = (long)resource;
+                       cb->args[1] = SINGLE_RESOURCE;
+               }
+       }
+       if (!resource) {
+               if (list_empty(&drbd_resources))
+                       goto out;
+               resource = list_first_entry(&drbd_resources, struct drbd_resource, resources);
+               kref_get(&resource->kref);
+               cb->args[0] = (long)resource;
+               cb->args[1] = ITERATE_RESOURCES;
+       }
+
+next_resource:
+       rcu_read_unlock();
+       mutex_lock(&resource->conf_update);
+       rcu_read_lock();
+       if (cb->args[2]) {
+               for_each_connection_rcu(connection, resource)
+                       if (connection == (struct drbd_connection *)cb->args[2])
+                               goto found_connection;
+               /* connection was probably deleted */
+               goto no_more_connections;
+       }
+       connection = list_entry(&resource->connections, struct drbd_connection, connections);
+
+found_connection:
+       list_for_each_entry_continue_rcu(connection, &resource->connections, connections) {
+               if (!has_net_conf(connection))
+                       continue;
+               retcode = NO_ERROR;
+               goto put_result;  /* only one iteration */
+       }
+
+no_more_connections:
+       if (cb->args[1] == ITERATE_RESOURCES) {
+               for_each_resource_rcu(next_resource, &drbd_resources) {
+                       if (next_resource == resource)
+                               goto found_resource;
+               }
+               /* resource was probably deleted */
+       }
+       goto out;
+
+found_resource:
+       list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) {
+               mutex_unlock(&resource->conf_update);
+               kref_put(&resource->kref, drbd_destroy_resource);
+               resource = next_resource;
+               kref_get(&resource->kref);
+               cb->args[0] = (long)resource;
+               cb->args[2] = 0;
+               goto next_resource;
+       }
+       goto out;  /* no more resources */
+
+put_result:
+       dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+                       cb->nlh->nlmsg_seq, &drbd_genl_family,
+                       NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS);
+       err = -ENOMEM;
+       if (!dh)
+               goto out;
+       dh->ret_code = retcode;
+       dh->minor = -1U;
+       if (retcode == NO_ERROR) {
+               struct net_conf *net_conf;
+
+               err = nla_put_drbd_cfg_context(skb, resource, connection, NULL);
+               if (err)
+                       goto out;
+               net_conf = rcu_dereference(connection->net_conf);
+               if (net_conf) {
+                       err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN));
+                       if (err)
+                               goto out;
+               }
+               connection_to_info(&connection_info, connection);
+               err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN));
+               if (err)
+                       goto out;
+               connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
+               err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
+               if (err)
+                       goto out;
+               cb->args[2] = (long)connection;
+       }
+       genlmsg_end(skb, dh);
+       err = 0;
+
+out:
+       rcu_read_unlock();
+       if (resource)
+               mutex_unlock(&resource->conf_update);
+       if (err)
+               return err;
+       return skb->len;
+}
+
+enum mdf_peer_flag {
+       MDF_PEER_CONNECTED =    1 << 0,
+       MDF_PEER_OUTDATED =     1 << 1,
+       MDF_PEER_FENCING =      1 << 2,
+       MDF_PEER_FULL_SYNC =    1 << 3,
+};
+
+static void peer_device_to_statistics(struct peer_device_statistics *s,
+                                     struct drbd_peer_device *peer_device)
+{
+       struct drbd_device *device = peer_device->device;
+
+       memset(s, 0, sizeof(*s));
+       s->peer_dev_received = device->recv_cnt;
+       s->peer_dev_sent = device->send_cnt;
+       s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
+                             atomic_read(&device->rs_pending_cnt);
+       s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
+       s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
+       s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
+       if (get_ldev(device)) {
+               struct drbd_md *md = &device->ldev->md;
+
+               spin_lock_irq(&md->uuid_lock);
+               s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
+               spin_unlock_irq(&md->uuid_lock);
+               s->peer_dev_flags =
+                       (drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
+                               MDF_PEER_CONNECTED : 0) +
+                       (drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
+                        !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
+                               MDF_PEER_OUTDATED : 0) +
+                       /* FIXME: MDF_PEER_FENCING? */
+                       (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
+                               MDF_PEER_FULL_SYNC : 0);
+               put_ldev(device);
+       }
+}
+
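The "<< (BM_BLOCK_SHIFT - 9)" conversions above turn bitmap weights into 512-byte sectors: each bitmap bit covers one 4 KiB block (BM_BLOCK_SHIFT is 12 in drbd_int.h), i.e. 2^(12-9) = 8 sectors per bit. A one-line worked example:

	#define BM_BLOCK_SHIFT 12	/* 4 KiB per bitmap bit, as in drbd_int.h */

	/* 100 out-of-sync bits -> 100 << (12 - 9) = 800 sectors = 400 KiB */
	static unsigned long bm_bits_to_sectors(unsigned long bits)
	{
		return bits << (BM_BLOCK_SHIFT - 9);
	}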
+int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb)
+{
+       return put_resource_in_arg0(cb, 9);
+}
+
+int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
+{
+       struct nlattr *resource_filter;
+       struct drbd_resource *resource;
+       struct drbd_device *uninitialized_var(device);
+       struct drbd_peer_device *peer_device = NULL;
+       int minor, err, retcode;
+       struct drbd_genlmsghdr *dh;
+       struct idr *idr_to_search;
+
+       resource = (struct drbd_resource *)cb->args[0];
+       if (!cb->args[0] && !cb->args[1]) {
+               resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+               if (resource_filter) {
+                       retcode = ERR_RES_NOT_KNOWN;
+                       resource = drbd_find_resource(nla_data(resource_filter));
+                       if (!resource)
+                               goto put_result;
+               }
+               cb->args[0] = (long)resource;
+       }
+
+       rcu_read_lock();
+       minor = cb->args[1];
+       idr_to_search = resource ? &resource->devices : &drbd_devices;
+       device = idr_find(idr_to_search, minor);
+       if (!device) {
+next_device:
+               minor++;
+               cb->args[2] = 0;
+               device = idr_get_next(idr_to_search, &minor);
+               if (!device) {
+                       err = 0;
+                       goto out;
+               }
+       }
+       if (cb->args[2]) {
+               for_each_peer_device(peer_device, device)
+                       if (peer_device == (struct drbd_peer_device *)cb->args[2])
+                               goto found_peer_device;
+               /* peer device was probably deleted */
+               goto next_device;
+       }
+       /* Make peer_device point to the list head (not the first entry). */
+       peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
+
+found_peer_device:
+       list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) {
+               if (!has_net_conf(peer_device->connection))
+                       continue;
+               retcode = NO_ERROR;
+               goto put_result;  /* only one iteration */
+       }
+       goto next_device;
+
+put_result:
+       dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+                       cb->nlh->nlmsg_seq, &drbd_genl_family,
+                       NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES);
+       err = -ENOMEM;
+       if (!dh)
+               goto out;
+       dh->ret_code = retcode;
+       dh->minor = -1U;
+       if (retcode == NO_ERROR) {
+               struct peer_device_info peer_device_info;
+               struct peer_device_statistics peer_device_statistics;
+
+               dh->minor = minor;
+               err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device);
+               if (err)
+                       goto out;
+               peer_device_to_info(&peer_device_info, peer_device);
+               err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN));
+               if (err)
+                       goto out;
+               peer_device_to_statistics(&peer_device_statistics, peer_device);
+               err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
+               if (err)
+                       goto out;
+               cb->args[1] = minor;
+               cb->args[2] = (long)peer_device;
+       }
+       genlmsg_end(skb, dh);
+       err = 0;
+
+out:
+       rcu_read_unlock();
+       if (err)
+               return err;
+       return skb->len;
+}
 /*
  * Return the connection of @resource if @resource has exactly one connection.
  */
@@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx)
        return NO_ERROR;
 }
 
+static void resource_to_info(struct resource_info *info,
+                            struct drbd_resource *resource)
+{
+       info->res_role = conn_highest_role(first_connection(resource));
+       info->res_susp = resource->susp;
+       info->res_susp_nod = resource->susp_nod;
+       info->res_susp_fen = resource->susp_fen;
+}
+
 int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 {
+       struct drbd_connection *connection;
        struct drbd_config_context adm_ctx;
        enum drbd_ret_code retcode;
        struct res_opts res_opts;
@@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
        }
 
        /* not yet safe for genl_family.parallel_ops */
-       if (!conn_create(adm_ctx.resource_name, &res_opts))
+       mutex_lock(&resources_mutex);
+       connection = conn_create(adm_ctx.resource_name, &res_opts);
+       mutex_unlock(&resources_mutex);
+
+       if (connection) {
+               struct resource_info resource_info;
+
+               mutex_lock(&notification_mutex);
+               resource_to_info(&resource_info, connection->resource);
+               notify_resource_state(NULL, 0, connection->resource,
+                                     &resource_info, NOTIFY_CREATE);
+               mutex_unlock(&notification_mutex);
+       } else
                retcode = ERR_NOMEM;
+
 out:
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
 }
 
+static void device_to_info(struct device_info *info,
+                          struct drbd_device *device)
+{
+       info->dev_disk_state = device->state.disk;
+}
+
 int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
 {
        struct drbd_config_context adm_ctx;
@@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
 
        mutex_lock(&adm_ctx.resource->adm_mutex);
        retcode = drbd_create_device(&adm_ctx, dh->minor);
+       if (retcode == NO_ERROR) {
+               struct drbd_device *device;
+               struct drbd_peer_device *peer_device;
+               struct device_info info;
+               unsigned int peer_devices = 0;
+               enum drbd_notification_type flags;
+
+               device = minor_to_device(dh->minor);
+               for_each_peer_device(peer_device, device) {
+                       if (!has_net_conf(peer_device->connection))
+                               continue;
+                       peer_devices++;
+               }
+
+               device_to_info(&info, device);
+               mutex_lock(&notification_mutex);
+               flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+               notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags);
+               for_each_peer_device(peer_device, device) {
+                       struct peer_device_info peer_device_info;
+
+                       if (!has_net_conf(peer_device->connection))
+                               continue;
+                       peer_device_to_info(&peer_device_info, peer_device);
+                       flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+                       notify_peer_device_state(NULL, 0, peer_device, &peer_device_info,
+                                                NOTIFY_CREATE | flags);
+               }
+               mutex_unlock(&notification_mutex);
+       }
        mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
        drbd_adm_finish(&adm_ctx, info, retcode);
@@ -3498,13 +4199,35 @@ out:
 
 static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
 {
+       struct drbd_peer_device *peer_device;
+
        if (device->state.disk == D_DISKLESS &&
            /* no need to be device->state.conn == C_STANDALONE &&
             * we may want to delete a minor from a live replication group.
             */
            device->state.role == R_SECONDARY) {
+               struct drbd_connection *connection =
+                       first_connection(device->resource);
+
                _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
                                    CS_VERBOSE + CS_WAIT_COMPLETE);
+
+               /* If the state engine hasn't stopped the sender thread yet, we
+                * need to flush the sender work queue before generating the
+                * DESTROY events here. */
+               if (get_t_state(&connection->worker) == RUNNING)
+                       drbd_flush_workqueue(&connection->sender_work);
+
+               mutex_lock(&notification_mutex);
+               for_each_peer_device(peer_device, device) {
+                       if (!has_net_conf(peer_device->connection))
+                               continue;
+                       notify_peer_device_state(NULL, 0, peer_device, NULL,
+                                                NOTIFY_DESTROY | NOTIFY_CONTINUES);
+               }
+               notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY);
+               mutex_unlock(&notification_mutex);
+
                drbd_delete_device(device);
                return NO_ERROR;
        } else
@@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource)
        if (!idr_is_empty(&resource->devices))
                return ERR_RES_IN_USE;
 
+       /* The state engine has stopped the sender thread, so we don't
+        * need to flush the sender work queue before generating the
+        * DESTROY event here. */
+       mutex_lock(&notification_mutex);
+       notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY);
+       mutex_unlock(&notification_mutex);
+
+       mutex_lock(&resources_mutex);
        list_del_rcu(&resource->resources);
+       mutex_unlock(&resources_mutex);
        /* Make sure all threads have actually stopped: state handling only
         * does drbd_thread_stop_nowait(). */
        list_for_each_entry(connection, &resource->connections, connections)
@@ -3637,7 +4369,6 @@ finish:
 
 void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
 {
-       static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
        struct sk_buff *msg;
        struct drbd_genlmsghdr *d_out;
        unsigned seq;
@@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
        if (nla_put_status_info(msg, device, sib))
                goto nla_put_failure;
        genlmsg_end(msg, d_out);
-       err = drbd_genl_multicast_events(msg, 0);
+       err = drbd_genl_multicast_events(msg, GFP_NOWAIT);
        /* msg has been consumed or freed in netlink_broadcast() */
        if (err && err != -ESRCH)
                goto failed;
@@ -3672,3 +4403,405 @@ failed:
                        "Event seq:%u sib_reason:%u\n",
                        err, seq, sib->sib_reason);
 }
+
+static int nla_put_notification_header(struct sk_buff *msg,
+                                      enum drbd_notification_type type)
+{
+       struct drbd_notification_header nh = {
+               .nh_type = type,
+       };
+
+       return drbd_notification_header_to_skb(msg, &nh, true);
+}
+
+void notify_resource_state(struct sk_buff *skb,
+                          unsigned int seq,
+                          struct drbd_resource *resource,
+                          struct resource_info *resource_info,
+                          enum drbd_notification_type type)
+{
+       struct resource_statistics resource_statistics;
+       struct drbd_genlmsghdr *dh;
+       bool multicast = false;
+       int err;
+
+       if (!skb) {
+               seq = atomic_inc_return(&notify_genl_seq);
+               skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+               err = -ENOMEM;
+               if (!skb)
+                       goto failed;
+               multicast = true;
+       }
+
+       err = -EMSGSIZE;
+       dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE);
+       if (!dh)
+               goto nla_put_failure;
+       dh->minor = -1U;
+       dh->ret_code = NO_ERROR;
+       if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) ||
+           nla_put_notification_header(skb, type) ||
+           ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+            resource_info_to_skb(skb, resource_info, true)))
+               goto nla_put_failure;
+       resource_statistics.res_stat_write_ordering = resource->write_ordering;
+       err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
+       if (err)
+               goto nla_put_failure;
+       genlmsg_end(skb, dh);
+       if (multicast) {
+               err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+               /* skb has been consumed or freed in netlink_broadcast() */
+               if (err && err != -ESRCH)
+                       goto failed;
+       }
+       return;
+
+nla_put_failure:
+       nlmsg_free(skb);
+failed:
+       drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
+                       err, seq);
+}
+
+void notify_device_state(struct sk_buff *skb,
+                        unsigned int seq,
+                        struct drbd_device *device,
+                        struct device_info *device_info,
+                        enum drbd_notification_type type)
+{
+       struct device_statistics device_statistics;
+       struct drbd_genlmsghdr *dh;
+       bool multicast = false;
+       int err;
+
+       if (!skb) {
+               seq = atomic_inc_return(&notify_genl_seq);
+               skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+               err = -ENOMEM;
+               if (!skb)
+                       goto failed;
+               multicast = true;
+       }
+
+       err = -EMSGSIZE;
+       dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
+       if (!dh)
+               goto nla_put_failure;
+       dh->minor = device->minor;
+       dh->ret_code = NO_ERROR;
+       if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) ||
+           nla_put_notification_header(skb, type) ||
+           ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+            device_info_to_skb(skb, device_info, true)))
+               goto nla_put_failure;
+       device_to_statistics(&device_statistics, device);
+       device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
+       genlmsg_end(skb, dh);
+       if (multicast) {
+               err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+               /* skb has been consumed or freed in netlink_broadcast() */
+               if (err && err != -ESRCH)
+                       goto failed;
+       }
+       return;
+
+nla_put_failure:
+       nlmsg_free(skb);
+failed:
+       drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n",
+                err, seq);
+}
+
+void notify_connection_state(struct sk_buff *skb,
+                            unsigned int seq,
+                            struct drbd_connection *connection,
+                            struct connection_info *connection_info,
+                            enum drbd_notification_type type)
+{
+       struct connection_statistics connection_statistics;
+       struct drbd_genlmsghdr *dh;
+       bool multicast = false;
+       int err;
+
+       if (!skb) {
+               seq = atomic_inc_return(&notify_genl_seq);
+               skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+               err = -ENOMEM;
+               if (!skb)
+                       goto failed;
+               multicast = true;
+       }
+
+       err = -EMSGSIZE;
+       dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE);
+       if (!dh)
+               goto nla_put_failure;
+       dh->minor = -1U;
+       dh->ret_code = NO_ERROR;
+       if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) ||
+           nla_put_notification_header(skb, type) ||
+           ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+            connection_info_to_skb(skb, connection_info, true)))
+               goto nla_put_failure;
+       connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
+       connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
+       genlmsg_end(skb, dh);
+       if (multicast) {
+               err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+               /* skb has been consumed or freed in netlink_broadcast() */
+               if (err && err != -ESRCH)
+                       goto failed;
+       }
+       return;
+
+nla_put_failure:
+       nlmsg_free(skb);
+failed:
+       drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n",
+                err, seq);
+}
+
+void notify_peer_device_state(struct sk_buff *skb,
+                             unsigned int seq,
+                             struct drbd_peer_device *peer_device,
+                             struct peer_device_info *peer_device_info,
+                             enum drbd_notification_type type)
+{
+       struct peer_device_statistics peer_device_statistics;
+       struct drbd_resource *resource = peer_device->device->resource;
+       struct drbd_genlmsghdr *dh;
+       bool multicast = false;
+       int err;
+
+       if (!skb) {
+               seq = atomic_inc_return(&notify_genl_seq);
+               skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+               err = -ENOMEM;
+               if (!skb)
+                       goto failed;
+               multicast = true;
+       }
+
+       err = -EMSGSIZE;
+       dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE);
+       if (!dh)
+               goto nla_put_failure;
+       dh->minor = -1U;
+       dh->ret_code = NO_ERROR;
+       if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) ||
+           nla_put_notification_header(skb, type) ||
+           ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+            peer_device_info_to_skb(skb, peer_device_info, true)))
+               goto nla_put_failure;
+       peer_device_to_statistics(&peer_device_statistics, peer_device);
+       peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
+       genlmsg_end(skb, dh);
+       if (multicast) {
+               err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+               /* skb has been consumed or freed in netlink_broadcast() */
+               if (err && err != -ESRCH)
+                       goto failed;
+       }
+       return;
+
+nla_put_failure:
+       nlmsg_free(skb);
+failed:
+       drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n",
+                err, seq);
+}
+
+void notify_helper(enum drbd_notification_type type,
+                  struct drbd_device *device, struct drbd_connection *connection,
+                  const char *name, int status)
+{
+       struct drbd_resource *resource = device ? device->resource : connection->resource;
+       struct drbd_helper_info helper_info;
+       unsigned int seq = atomic_inc_return(&notify_genl_seq);
+       struct sk_buff *skb = NULL;
+       struct drbd_genlmsghdr *dh;
+       int err;
+
+       strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name));
+       helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name));
+       helper_info.helper_status = status;
+
+       skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+       err = -ENOMEM;
+       if (!skb)
+               goto fail;
+
+       err = -EMSGSIZE;
+       dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER);
+       if (!dh)
+               goto fail;
+       dh->minor = device ? device->minor : -1;
+       dh->ret_code = NO_ERROR;
+       mutex_lock(&notification_mutex);
+       if (nla_put_drbd_cfg_context(skb, resource, connection, device) ||
+           nla_put_notification_header(skb, type) ||
+           drbd_helper_info_to_skb(skb, &helper_info, true))
+               goto unlock_fail;
+       genlmsg_end(skb, dh);
+       err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+       skb = NULL;
+       /* skb has been consumed or freed in netlink_broadcast() */
+       if (err && err != -ESRCH)
+               goto unlock_fail;
+       mutex_unlock(&notification_mutex);
+       return;
+
+unlock_fail:
+       mutex_unlock(&notification_mutex);
+fail:
+       nlmsg_free(skb);
+       drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
+                err, seq);
+}
+
+static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
+{
+       struct drbd_genlmsghdr *dh;
+       int err;
+
+       err = -EMSGSIZE;
+       dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE);
+       if (!dh)
+               goto nla_put_failure;
+       dh->minor = -1U;
+       dh->ret_code = NO_ERROR;
+       if (nla_put_notification_header(skb, NOTIFY_EXISTS))
+               goto nla_put_failure;
+       genlmsg_end(skb, dh);
+       return;
+
+nla_put_failure:
+       nlmsg_free(skb);
+       pr_err("Error %d sending event. Event seq:%u\n", err, seq);
+}
+
+static void free_state_changes(struct list_head *list)
+{
+       while (!list_empty(list)) {
+               struct drbd_state_change *state_change =
+                       list_first_entry(list, struct drbd_state_change, list);
+               list_del(&state_change->list);
+               forget_state_change(state_change);
+       }
+}
+
+static unsigned int notifications_for_state_change(struct drbd_state_change *state_change)
+{
+       return 1 +
+              state_change->n_connections +
+              state_change->n_devices +
+              state_change->n_devices * state_change->n_connections;
+}
+
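For example, a state change over a resource with two connections and three devices expands to 1 + 2 + 3 + 2 * 3 = 12 notifications; get_initial_state() below walks exactly that flat sequence, decoding the running index n back into resource, connection, device, and peer-device events in that order.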
+static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
+{
+       struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0];
+       unsigned int seq = cb->args[2];
+       unsigned int n;
+       enum drbd_notification_type flags = 0;
+
+       /* There is no need for taking notification_mutex here: it doesn't
+          matter if the initial state events mix with later state change
+          events; we can always tell the events apart by the NOTIFY_EXISTS
+          flag. */
+
+       cb->args[5]--;
+       if (cb->args[5] == 1) {
+               notify_initial_state_done(skb, seq);
+               goto out;
+       }
+       n = cb->args[4]++;
+       if (cb->args[4] < cb->args[3])
+               flags |= NOTIFY_CONTINUES;
+       if (n < 1) {
+               notify_resource_state_change(skb, seq, state_change->resource,
+                                            NOTIFY_EXISTS | flags);
+               goto next;
+       }
+       n--;
+       if (n < state_change->n_connections) {
+               notify_connection_state_change(skb, seq, &state_change->connections[n],
+                                              NOTIFY_EXISTS | flags);
+               goto next;
+       }
+       n -= state_change->n_connections;
+       if (n < state_change->n_devices) {
+               notify_device_state_change(skb, seq, &state_change->devices[n],
+                                          NOTIFY_EXISTS | flags);
+               goto next;
+       }
+       n -= state_change->n_devices;
+       if (n < state_change->n_devices * state_change->n_connections) {
+               notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
+                                               NOTIFY_EXISTS | flags);
+               goto next;
+       }
+
+next:
+       if (cb->args[4] == cb->args[3]) {
+               struct drbd_state_change *next_state_change =
+                       list_entry(state_change->list.next,
+                                  struct drbd_state_change, list);
+               cb->args[0] = (long)next_state_change;
+               cb->args[3] = notifications_for_state_change(next_state_change);
+               cb->args[4] = 0;
+       }
+out:
+       return skb->len;
+}
+
+int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
+{
+       struct drbd_resource *resource;
+       LIST_HEAD(head);
+
+       if (cb->args[5] >= 1) {
+               if (cb->args[5] > 1)
+                       return get_initial_state(skb, cb);
+               if (cb->args[0]) {
+                       struct drbd_state_change *state_change =
+                               (struct drbd_state_change *)cb->args[0];
+
+                       /* connect list to head */
+                       list_add(&head, &state_change->list);
+                       free_state_changes(&head);
+               }
+               return 0;
+       }
+
+       cb->args[5] = 2;  /* number of iterations */
+       mutex_lock(&resources_mutex);
+       for_each_resource(resource, &drbd_resources) {
+               struct drbd_state_change *state_change;
+
+               state_change = remember_old_state(resource, GFP_KERNEL);
+               if (!state_change) {
+                       if (!list_empty(&head))
+                               free_state_changes(&head);
+                       mutex_unlock(&resources_mutex);
+                       return -ENOMEM;
+               }
+               copy_old_to_new_state_change(state_change);
+               list_add_tail(&state_change->list, &head);
+               cb->args[5] += notifications_for_state_change(state_change);
+       }
+       mutex_unlock(&resources_mutex);
+
+       if (!list_empty(&head)) {
+               struct drbd_state_change *state_change =
+                       list_entry(head.next, struct drbd_state_change, list);
+               cb->args[0] = (long)state_change;
+               cb->args[3] = notifications_for_state_change(state_change);
+               list_del(&head);  /* detach list from head */
+       }
+
+       cb->args[2] = cb->nlh->nlmsg_seq;
+       return get_initial_state(skb, cb);
+}
index 3b10fa6cb039339a2301350862b7d87e95c6a0cb..6537b25db9c1afaafdb35c61b83b6be431109922 100644 (file)
@@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
        char wp;
 
        static char write_ordering_chars[] = {
-               [WO_none] = 'n',
-               [WO_drain_io] = 'd',
-               [WO_bdev_flush] = 'f',
+               [WO_NONE] = 'n',
+               [WO_DRAIN_IO] = 'd',
+               [WO_BDEV_FLUSH] = 'f',
        };
 
        seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
index 2da9104a3851813010eae268fa778c82ac5b2d4c..ef9245363dccc6183e680083d764404e2e6acd16 100644 (file)
@@ -23,7 +23,7 @@ enum drbd_packet {
        P_AUTH_RESPONSE       = 0x11,
        P_STATE_CHG_REQ       = 0x12,
 
-       /* asender (meta socket */
+       /* (meta socket) */
        P_PING                = 0x13,
        P_PING_ACK            = 0x14,
        P_RECV_ACK            = 0x15, /* Used in protocol B */
index b4b5680ac6adb1dcdbda2428b08398d38e2c8d4b..1957fe8601dcbb60ee2b8d3aa9367104fd503daf 100644 (file)
@@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
        }
 }
 
-static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
+static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
 {
        LIST_HEAD(reclaimed);
        struct drbd_peer_request *peer_req, *t;
@@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
        spin_lock_irq(&device->resource->req_lock);
        reclaim_finished_net_peer_reqs(device, &reclaimed);
        spin_unlock_irq(&device->resource->req_lock);
-
        list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
                drbd_free_net_peer_req(device, peer_req);
 }
 
+static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
+{
+       struct drbd_peer_device *peer_device;
+       int vnr;
+
+       rcu_read_lock();
+       idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+               struct drbd_device *device = peer_device->device;
+               if (!atomic_read(&device->pp_in_use_by_net))
+                       continue;
+
+               kref_get(&device->kref);
+               rcu_read_unlock();
+               drbd_reclaim_net_peer_reqs(device);
+               kref_put(&device->kref, drbd_destroy_device);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+}
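conn_reclaim_net_peer_reqs() above uses the standard trick for calling possibly-sleeping code from inside an RCU-protected IDR walk: pin the current element with a kref, drop the read lock, do the work, then retake the lock and continue (the idr cursor makes resuming at the next id safe). The bare idiom, as a hedged fragment (do_sleeping_work() and release_obj() are placeholders):

	rcu_read_lock();
	idr_for_each_entry(&some_idr, obj, id) {
		kref_get(&obj->kref);		/* keep obj alive across the unlock */
		rcu_read_unlock();
		do_sleeping_work(obj);		/* may block; not allowed under RCU */
		kref_put(&obj->kref, release_obj);
		rcu_read_lock();		/* resume the walk at the next id */
	}
	rcu_read_unlock();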
+
 /**
  * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
  * @device:    DRBD device.
@@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
        if (atomic_read(&device->pp_in_use) < mxb)
                page = __drbd_alloc_pages(device, number);
 
+       /* Try to keep the fast path fast, but occasionally we need
+        * to reclaim the pages we lent to the network stack. */
+       if (page && atomic_read(&device->pp_in_use_by_net) > 512)
+               drbd_reclaim_net_peer_reqs(device);
+
        while (page == NULL) {
                prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 
-               drbd_kick_lo_and_reclaim_net(device);
+               drbd_reclaim_net_peer_reqs(device);
 
                if (atomic_read(&device->pp_in_use) < mxb) {
                        page = __drbd_alloc_pages(device, number);
@@ -1099,7 +1123,15 @@ randomize:
                return 0;
        }
 
-       drbd_thread_start(&connection->asender);
+       drbd_thread_start(&connection->ack_receiver);
+       /* opencoded create_singlethread_workqueue(),
+        * to be able to use format string arguments */
+       connection->ack_sender =
+               alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
+       if (!connection->ack_sender) {
+               drbd_err(connection, "Failed to create workqueue ack_sender\n");
+               return 0;
+       }
 
        mutex_lock(&connection->resource->conf_update);
        /* The discard_my_data flag is a single-shot modifier to the next
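The "opencoded create_singlethread_workqueue()" comment refers to the fact that create_singlethread_workqueue() is itself a thin wrapper around alloc_ordered_workqueue() with a fixed "%s" format; calling alloc_ordered_workqueue() directly lets the queue name embed the resource name. Roughly (a sketch, not the exact wrapper definition):

	/* Equivalent results, but only the second can format the name: */
	wq = create_singlethread_workqueue("drbd_as_r0");
	wq = alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, resource->name);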
@@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection)
        struct drbd_peer_device *peer_device;
        int vnr;
 
-       if (connection->resource->write_ordering >= WO_bdev_flush) {
+       if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
                rcu_read_lock();
                idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
                        struct drbd_device *device = peer_device->device;
@@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection)
                                /* would rather check on EOPNOTSUPP, but that is not reliable.
                                 * don't try again for ANY return value != 0
                                 * if (rv == -EOPNOTSUPP) */
-                               drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
+                               drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
                        }
                        put_ldev(device);
                        kref_put(&device->kref, drbd_destroy_device);
@@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
 
        dc = rcu_dereference(bdev->disk_conf);
 
-       if (wo == WO_bdev_flush && !dc->disk_flushes)
-               wo = WO_drain_io;
-       if (wo == WO_drain_io && !dc->disk_drain)
-               wo = WO_none;
+       if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
+               wo = WO_DRAIN_IO;
+       if (wo == WO_DRAIN_IO && !dc->disk_drain)
+               wo = WO_NONE;
 
        return wo;
 }
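The renamed constants form an ordered scale, WO_NONE < WO_DRAIN_IO < WO_BDEV_FLUSH, and both max_allowed_wo() and drbd_bump_write_ordering() lean on that ordering: a method is only ever weakened at runtime (min()), never silently strengthened. A small sketch of the downgrade rule (simplified; the real code also consults the per-disk disk_flushes/disk_drain settings and every attached device):

	enum write_ordering_e { WO_NONE, WO_DRAIN_IO, WO_BDEV_FLUSH };

	static enum write_ordering_e
	bump_write_ordering(enum write_ordering_e cur, enum write_ordering_e wo)
	{
		/* Only an explicit request for WO_BDEV_FLUSH may re-raise the
		 * method; anything else can only keep or lower it. */
		if (wo != WO_BDEV_FLUSH)
			wo = cur < wo ? cur : wo;	/* min(cur, wo) */
		return wo;
	}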
@@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
        enum write_ordering_e pwo;
        int vnr;
        static char *write_ordering_str[] = {
-               [WO_none] = "none",
-               [WO_drain_io] = "drain",
-               [WO_bdev_flush] = "flush",
+               [WO_NONE] = "none",
+               [WO_DRAIN_IO] = "drain",
+               [WO_BDEV_FLUSH] = "flush",
        };
 
        pwo = resource->write_ordering;
-       if (wo != WO_bdev_flush)
+       if (wo != WO_BDEV_FLUSH)
                wo = min(pwo, wo);
        rcu_read_lock();
        idr_for_each_entry(&resource->devices, device, vnr) {
@@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
        rcu_read_unlock();
 
        resource->write_ordering = wo;
-       if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+       if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
                drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
@@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
        if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
                /* wait for all pending IO completions, before we start
                 * zeroing things out. */
-               conn_wait_active_ee_empty(first_peer_device(device)->connection);
+               conn_wait_active_ee_empty(peer_req->peer_device->connection);
                /* add it to the active list now,
                 * so we can find it to present it in debugfs */
                peer_req->submit_jif = jiffies;
@@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection)
        rcu_read_unlock();
 }
 
-static struct drbd_peer_device *
-conn_peer_device(struct drbd_connection *connection, int volume_number)
-{
-       return idr_find(&connection->peer_devices, volume_number);
-}
-
 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
 {
        int rv;
@@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
         * Therefore we must send the barrier_ack after the barrier request was
         * completed. */
        switch (connection->resource->write_ordering) {
-       case WO_none:
+       case WO_NONE:
                if (rv == FE_RECYCLED)
                        return 0;
 
@@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
                        drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
                        /* Fall through */
 
-       case WO_bdev_flush:
-       case WO_drain_io:
+       case WO_BDEV_FLUSH:
+       case WO_DRAIN_IO:
                conn_wait_active_ee_empty(connection);
                drbd_flush(connection);
 
@@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
 }
 
 /*
- * e_end_resync_block() is called in asender context via
+ * e_end_resync_block() is called in ack_sender context via
  * drbd_finish_peer_reqs().
  */
 static int e_end_resync_block(struct drbd_work *w, int unused)
@@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device,
 }
 
 /*
- * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+ * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
  */
 static int e_end_block(struct drbd_work *w, int cancel)
 {
@@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
        } else
                D_ASSERT(device, drbd_interval_empty(&peer_req->i));
 
-       drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+       drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
 
        return err;
 }
@@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
                }
 
                rcu_read_lock();
-               tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
+               tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
                rcu_read_unlock();
 
                if (!tp)
@@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device,
                        peer_req->w.cb = superseded ? e_send_superseded :
                                                   e_send_retry_write;
                        list_add_tail(&peer_req->w.list, &device->done_ee);
-                       wake_asender(connection);
+                       queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
 
                        err = -ENOENT;
                        goto out;
@@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        if (dp_flags & DP_SEND_RECEIVE_ACK) {
                /* I really don't like it that the receiver thread
                 * sends on the msock, but anyways */
-               drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+               drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
        }
 
        if (tp) {
@@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
        os = ns = drbd_read_state(device);
        spin_unlock_irq(&device->resource->req_lock);
 
-       /* If some other part of the code (asender thread, timeout)
+       /* If some other part of the code (ack_receiver thread, timeout)
         * already decided to close the connection again,
         * we must not "re-establish" it here. */
        if (os.conn <= C_TEAR_DOWN)
@@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection)
         */
        conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
 
-       /* asender does not clean up anything. it must not interfere, either */
-       drbd_thread_stop(&connection->asender);
+       /* ack_receiver does not clean up anything. it must not interfere, either */
+       drbd_thread_stop(&connection->ack_receiver);
+       if (connection->ack_sender) {
+               destroy_workqueue(connection->ack_sender);
+               connection->ack_sender = NULL;
+       }
        drbd_free_sock(connection);
 
        rcu_read_lock();
@@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
        return 0;
 }
 
-static int connection_finish_peer_reqs(struct drbd_connection *connection)
+struct meta_sock_cmd {
+       size_t pkt_size;
+       int (*fn)(struct drbd_connection *connection, struct packet_info *);
+};
+
+static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
 {
-       struct drbd_peer_device *peer_device;
-       int vnr, not_empty = 0;
+       long t;
+       struct net_conf *nc;
 
-       do {
-               clear_bit(SIGNAL_ASENDER, &connection->flags);
-               flush_signals(current);
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       t = ping_timeout ? nc->ping_timeo : nc->ping_int;
+       rcu_read_unlock();
 
-               rcu_read_lock();
-               idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-                       struct drbd_device *device = peer_device->device;
-                       kref_get(&device->kref);
-                       rcu_read_unlock();
-                       if (drbd_finish_peer_reqs(device)) {
-                               kref_put(&device->kref, drbd_destroy_device);
-                               return 1;
-                       }
-                       kref_put(&device->kref, drbd_destroy_device);
-                       rcu_read_lock();
-               }
-               set_bit(SIGNAL_ASENDER, &connection->flags);
+       t *= HZ;
+       if (ping_timeout)
+               t /= 10;
 
-               spin_lock_irq(&connection->resource->req_lock);
-               idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-                       struct drbd_device *device = peer_device->device;
-                       not_empty = !list_empty(&device->done_ee);
-                       if (not_empty)
-                               break;
-               }
-               spin_unlock_irq(&connection->resource->req_lock);
-               rcu_read_unlock();
-       } while (not_empty);
+       connection->meta.socket->sk->sk_rcvtimeo = t;
+}
 
-       return 0;
+static void set_ping_timeout(struct drbd_connection *connection)
+{
+       set_rcvtimeo(connection, 1);
 }
 
-struct asender_cmd {
-       size_t pkt_size;
-       int (*fn)(struct drbd_connection *connection, struct packet_info *);
-};
+static void set_idle_timeout(struct drbd_connection *connection)
+{
+       set_rcvtimeo(connection, 0);
+}
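
The two wrappers above differ only in unit handling: as in the open-coded
logic they replace (ping_timeo * HZ / 10 versus ping_int * HZ, elsewhere in
this diff), ping_timeo is configured in tenths of a second while ping_int is
in seconds. A standalone sketch of the conversion, assuming HZ is 250 for the
sake of the example:

#include <stdio.h>

#define HZ 250	/* assumed tick rate, for illustration only */

static long rcvtimeo_jiffies(long ping_timeo, long ping_int, int ping_timeout)
{
	long t = ping_timeout ? ping_timeo : ping_int;

	t *= HZ;
	if (ping_timeout)
		t /= 10;	/* ping_timeo is in deciseconds */
	return t;
}

int main(void)
{
	/* ping timeout 5 (0.5s) -> 125 jiffies; idle 10 (10s) -> 2500 jiffies */
	printf("%ld %ld\n",
	       rcvtimeo_jiffies(5, 10, 1),
	       rcvtimeo_jiffies(5, 10, 0));
	return 0;
}
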
 
-static struct asender_cmd asender_tbl[] = {
+static struct meta_sock_cmd ack_receiver_tbl[] = {
        [P_PING]            = { 0, got_Ping },
        [P_PING_ACK]        = { 0, got_PingAck },
        [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
@@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = {
        [P_RETRY_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
 };
 
-int drbd_asender(struct drbd_thread *thi)
+int drbd_ack_receiver(struct drbd_thread *thi)
 {
        struct drbd_connection *connection = thi->connection;
-       struct asender_cmd *cmd = NULL;
+       struct meta_sock_cmd *cmd = NULL;
        struct packet_info pi;
+       unsigned long pre_recv_jif;
        int rv;
        void *buf    = connection->meta.rbuf;
        int received = 0;
        unsigned int header_size = drbd_header_size(connection);
        int expect   = header_size;
        bool ping_timeout_active = false;
-       struct net_conf *nc;
-       int ping_timeo, tcp_cork, ping_int;
        struct sched_param param = { .sched_priority = 2 };
 
        rv = sched_setscheduler(current, SCHED_RR, &param);
        if (rv < 0)
-               drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
+               drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
 
        while (get_t_state(thi) == RUNNING) {
                drbd_thread_current_set_cpu(thi);
 
-               rcu_read_lock();
-               nc = rcu_dereference(connection->net_conf);
-               ping_timeo = nc->ping_timeo;
-               tcp_cork = nc->tcp_cork;
-               ping_int = nc->ping_int;
-               rcu_read_unlock();
+               conn_reclaim_net_peer_reqs(connection);
 
                if (test_and_clear_bit(SEND_PING, &connection->flags)) {
                        if (drbd_send_ping(connection)) {
                                drbd_err(connection, "drbd_send_ping has failed\n");
                                goto reconnect;
                        }
-                       connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+                       set_ping_timeout(connection);
                        ping_timeout_active = true;
                }
 
-               /* TODO: conditionally cork; it may hurt latency if we cork without
-                  much to send */
-               if (tcp_cork)
-                       drbd_tcp_cork(connection->meta.socket);
-               if (connection_finish_peer_reqs(connection)) {
-                       drbd_err(connection, "connection_finish_peer_reqs() failed\n");
-                       goto reconnect;
-               }
-               /* but unconditionally uncork unless disabled */
-               if (tcp_cork)
-                       drbd_tcp_uncork(connection->meta.socket);
-
-               /* short circuit, recv_msg would return EINTR anyways. */
-               if (signal_pending(current))
-                       continue;
-
+               pre_recv_jif = jiffies;
                rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
-               clear_bit(SIGNAL_ASENDER, &connection->flags);
-
-               flush_signals(current);
 
                /* Note:
                 * -EINTR        (on meta) we got a signal
@@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi)
                 * rv <  expected: "woken" by signal during receive
                 * rv == 0       : "connection shut down by peer"
                 */
-received_more:
                if (likely(rv > 0)) {
                        received += rv;
                        buf      += rv;
@@ -5584,8 +5579,7 @@ received_more:
                } else if (rv == -EAGAIN) {
                        /* If the data socket received something meanwhile,
                         * that is good enough: peer is still alive. */
-                       if (time_after(connection->last_received,
-                               jiffies - connection->meta.socket->sk->sk_rcvtimeo))
+                       if (time_after(connection->last_received, pre_recv_jif))
                                continue;
                        if (ping_timeout_active) {
                                drbd_err(connection, "PingAck did not arrive in time.\n");
@@ -5594,6 +5588,10 @@ received_more:
                        set_bit(SEND_PING, &connection->flags);
                        continue;
                } else if (rv == -EINTR) {
+                       /* maybe drbd_thread_stop(): the while condition will notice.
+                        * maybe woken for send_ping: we'll send a ping above,
+                        * and change the rcvtimeo */
+                       flush_signals(current);
                        continue;
                } else {
                        drbd_err(connection, "sock_recvmsg returned %d\n", rv);
@@ -5603,8 +5601,8 @@ received_more:
                if (received == expect && cmd == NULL) {
                        if (decode_header(connection, connection->meta.rbuf, &pi))
                                goto reconnect;
-                       cmd = &asender_tbl[pi.cmd];
-                       if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
+                       cmd = &ack_receiver_tbl[pi.cmd];
+                       if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
                                drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
                                         cmdname(pi.cmd), pi.cmd);
                                goto disconnect;
@@ -5627,9 +5625,8 @@ received_more:
 
                        connection->last_received = jiffies;
 
-                       if (cmd == &asender_tbl[P_PING_ACK]) {
-                               /* restore idle timeout */
-                               connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+                       if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
+                               set_idle_timeout(connection);
                                ping_timeout_active = false;
                        }
 
@@ -5638,11 +5635,6 @@ received_more:
                        expect   = header_size;
                        cmd      = NULL;
                }
-               if (test_bit(SEND_PING, &connection->flags))
-                       continue;
-               rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
-               if (rv > 0)
-                       goto received_more;
        }
 
        if (0) {
@@ -5654,9 +5646,41 @@ reconnect:
 disconnect:
                conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
        }
-       clear_bit(SIGNAL_ASENDER, &connection->flags);
 
-       drbd_info(connection, "asender terminated\n");
+       drbd_info(connection, "ack_receiver terminated\n");
 
        return 0;
 }
+
+void drbd_send_acks_wf(struct work_struct *ws)
+{
+       struct drbd_peer_device *peer_device =
+               container_of(ws, struct drbd_peer_device, send_acks_work);
+       struct drbd_connection *connection = peer_device->connection;
+       struct drbd_device *device = peer_device->device;
+       struct net_conf *nc;
+       int tcp_cork, err;
+
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       tcp_cork = nc->tcp_cork;
+       rcu_read_unlock();
+
+       if (tcp_cork)
+               drbd_tcp_cork(connection->meta.socket);
+
+       err = drbd_finish_peer_reqs(device);
+       kref_put(&device->kref, drbd_destroy_device);
+       /* get is in drbd_endio_write_sec_final(). That is necessary to keep the
+          struct work_struct send_acks_work alive, which is in the peer_device object */
+
+       if (err) {
+               conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+               return;
+       }
+
+       if (tcp_cork)
+               drbd_tcp_uncork(connection->meta.socket);
+
+       return;
+}
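
drbd_send_acks_wf() batches acks with the classic cork/send/uncork idiom;
drbd_tcp_cork()/drbd_tcp_uncork() toggle the TCP_CORK socket option so that
the many small ack packets coalesce into full frames. A userspace sketch of
the same pattern (a code fragment only, error handling trimmed):

#include <stddef.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void send_acks_batched(int fd, const void *acks[],
			      const size_t len[], int n)
{
	int one = 1, zero = 0;

	/* cork: let small writes accumulate into full frames */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &one, sizeof(one));
	for (int i = 0; i < n; i++)
		send(fd, acks[i], len[i], 0);
	/* uncork: push out whatever is still buffered */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &zero, sizeof(zero));
}

Note that the workqueue function above deliberately skips the uncork when
drbd_finish_peer_reqs() fails: the connection is being torn down at that
point, so there is nothing left worth flushing.
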
index 3ae2c00865635f889e4040d0e218786c237ff3e6..2255dcfebd2b514d2373424718e1ad4831a0c3ab 100644 (file)
@@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
                kref_get(&req->kref); /* wait for the DONE */
 
        if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
-               /* potentially already completed in the asender thread */
+               /* potentially already completed in the ack_receiver thread */
                if (!(s & RQ_NET_DONE)) {
                        atomic_add(req->i.size >> 9, &device->ap_in_flight);
                        set_if_null_req_not_net_done(peer_device, req);
                }
-               if (s & RQ_NET_PENDING)
+               if (req->rq_state & RQ_NET_PENDING)
                        set_if_null_req_ack_pending(peer_device, req);
        }
 
@@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req)
        return false;
 }
 
+bool drbd_should_do_remote(union drbd_dev_state s)
+{
+       return s.pdsk == D_UP_TO_DATE ||
+               (s.pdsk >= D_INCONSISTENT &&
+                s.conn >= C_WF_BITMAP_T &&
+                s.conn < C_AHEAD);
+       /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+          That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+          states. */
+}
+
+static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+       return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+       /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+          since we enter state C_AHEAD only if proto >= 96 */
+}
+
 /* returns number of connections (== 1, for drbd 8.4)
  * expected to actually write this data,
  * which does NOT include those that we are L_AHEAD for. */
@@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req)
         * stable storage, and this is a WRITE, we may not even submit
         * this bio. */
        if (get_ldev(device)) {
-               req->pre_submit_jif = jiffies;
                if (drbd_insert_fault(device,
                                      rw == WRITE ? DRBD_FAULT_DT_WR
                                    : rw == READ  ? DRBD_FAULT_DT_RD
@@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
                        &device->pending_master_completion[rw == WRITE]);
        if (req->private_bio) {
                /* needs to be marked within the same spinlock */
+               req->pre_submit_jif = jiffies;
                list_add_tail(&req->req_pending_local,
                        &device->pending_completion[rw == WRITE]);
                _req_mod(req, TO_BE_SUBMITTED);
@@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
        return BLK_QC_T_NONE;
 }
 
+static bool net_timeout_reached(struct drbd_request *net_req,
+               struct drbd_connection *connection,
+               unsigned long now, unsigned long ent,
+               unsigned int ko_count, unsigned int timeout)
+{
+       struct drbd_device *device = net_req->device;
+
+       if (!time_after(now, net_req->pre_send_jif + ent))
+               return false;
+
+       if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
+               return false;
+
+       if (net_req->rq_state & RQ_NET_PENDING) {
+               drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+                       jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+               return true;
+       }
+
+       /* We received an ACK already (or are using protocol A),
+        * but are waiting for the epoch closing barrier ack.
+        * Check if we sent the barrier already.  We should not blame the peer
+        * for being unresponsive, if we did not even ask it yet. */
+       if (net_req->epoch == connection->send.current_epoch_nr) {
+               drbd_warn(device,
+                       "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
+                       jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+               return false;
+       }
+
+       /* Worst case: we may have been blocked for whatever reason, then
+        * suddenly are able to send a lot of requests (and epoch separating
+        * barriers) in quick succession.
+        * The timestamp of the net_req may be much too old and not correspond
+        * to the sending time of the relevant unack'ed barrier packet, so
+        * would trigger a spurious timeout.  The latest barrier packet may
+        * have a timestamp too recent to trigger the timeout, so we could
+        * miss a timeout.  Right now we don't have a place to conveniently store
+        * these timestamps.
+        * But in this particular situation, the application requests are still
+        * completed to upper layers, DRBD should still "feel" responsive.
+        * No need yet to kill this connection, it may still recover.
+        * If not, eventually we will have queued enough into the network for
+        * us to block. From that point of view, the timestamp of the last sent
+        * barrier packet is relevant enough.
+        */
+       if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
+               drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+                       connection->send.last_sent_barrier_jif, now,
+                       jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
+               return true;
+       }
+       return false;
+}
+
+/* A request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ *   with some state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ *   resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ *   for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is caught by
+ *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+
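
The wrap-around safety referred to in the comment comes from the kernel's
jiffies helpers, which compare timestamps via signed subtraction rather than
a plain less-than. A minimal reimplementation (simplified types, not kernel
code) showing why the checks stay correct across a counter wrap:

#include <stdio.h>

typedef unsigned long jif_t;

/* signed-difference comparison, as in the kernel's time_after() */
static int time_is_after(jif_t a, jif_t b) { return (long)(b - a) < 0; }

static int time_is_in_range(jif_t t, jif_t lo, jif_t hi)
{
	return !time_is_after(lo, t) && !time_is_after(t, hi);
}

int main(void)
{
	jif_t ent = 100;
	jif_t sent = (jif_t)-20;	/* timestamp taken just before the wrap */
	jif_t now = 10;			/* 30 ticks later; the counter wrapped */

	/* only 30 < 100 ticks have passed: correctly no timeout */
	printf("timed out: %d\n", time_is_after(now, sent + ent));	/* 0 */
	printf("in range:  %d\n", time_is_in_range(now, sent, sent + ent)); /* 1 */
	return 0;
}
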
 void request_timer_fn(unsigned long data)
 {
        struct drbd_device *device = (struct drbd_device *) data;
@@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data)
        unsigned long oldest_submit_jif;
        unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
        unsigned long now;
+       unsigned int ko_count = 0, timeout = 0;
 
        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);
-       if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
-               ent = nc->timeout * HZ/10 * nc->ko_count;
+       if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
+               ko_count = nc->ko_count;
+               timeout = nc->timeout;
+       }
 
        if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
                dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
@@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data)
        }
        rcu_read_unlock();
 
+
+       ent = timeout * HZ/10 * ko_count;
        et = min_not_zero(dt, ent);
 
        if (!et)
@@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data)
        spin_lock_irq(&device->resource->req_lock);
        req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
        req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
-       req_peer = connection->req_not_net_done;
+
        /* maybe the oldest request waiting for the peer is in fact still
-        * blocking in tcp sendmsg */
-       if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
-               req_peer = connection->req_next;
+        * blocking in tcp sendmsg.  That's ok, though, that's handled via the
+        * socket send timeout, requesting a ping, and bumping ko-count in
+        * we_should_drop_the_connection().
+        */
+
+       /* check the oldest request we successfully sent,
+        * but which is still waiting for an ACK. */
+       req_peer = connection->req_ack_pending;
+
+       /* if we don't have such a request (e.g. protocol A)
+        * check the oldest request which is still waiting on its epoch
+        * closing barrier ack. */
+       if (!req_peer)
+               req_peer = connection->req_not_net_done;
 
        /* evaluate the oldest peer request only in one timer! */
        if (req_peer && req_peer->device != device)
@@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data)
                : req_write ? req_write->pre_submit_jif
                : req_read ? req_read->pre_submit_jif : now;
 
-       /* The request is considered timed out, if
-        * - we have some effective timeout from the configuration,
-        *   with above state restrictions applied,
-        * - the oldest request is waiting for a response from the network
-        *   resp. the local disk,
-        * - the oldest request is in fact older than the effective timeout,
-        * - the connection was established (resp. disk was attached)
-        *   for longer than the timeout already.
-        * Note that for 32bit jiffies and very stable connections/disks,
-        * we may have a wrap around, which is catched by
-        *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
-        *
-        * Side effect: once per 32bit wrap-around interval, which means every
-        * ~198 days with 250 HZ, we have a window where the timeout would need
-        * to expire twice (worst case) to become effective. Good enough.
-        */
-       if (ent && req_peer &&
-                time_after(now, req_peer->pre_send_jif + ent) &&
-               !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
-               drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
+       if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
                _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
-       }
+
        if (dt && oldest_submit_jif != now &&
                 time_after(now, oldest_submit_jif + dt) &&
                !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
index 9f6a04080e9f76aadfdfedf8d0e1cb408dbcba2a..bb2ef78165e5fe3b0a9394bbdaec61e1e99e66d4 100644 (file)
@@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req,
        return rv;
 }
 
-static inline bool drbd_should_do_remote(union drbd_dev_state s)
-{
-       return s.pdsk == D_UP_TO_DATE ||
-               (s.pdsk >= D_INCONSISTENT &&
-                s.conn >= C_WF_BITMAP_T &&
-                s.conn < C_AHEAD);
-       /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
-          That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
-          states. */
-}
-static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
-{
-       return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
-       /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
-          since we enter state C_AHEAD only if proto >= 96 */
-}
+extern bool drbd_should_do_remote(union drbd_dev_state);
 
 #endif
index 2d7dd269b6a82a41f94ce044f4baf4c87c92d473..5a7ef7873b675b73023001478be61e746b3d1194 100644 (file)
@@ -29,6 +29,7 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
+#include "drbd_state_change.h"
 
 struct after_state_chg_work {
        struct drbd_work w;
@@ -37,6 +38,7 @@ struct after_state_chg_work {
        union drbd_state ns;
        enum chg_state_flags flags;
        struct completion *done;
+       struct drbd_state_change *state_change;
 };
 
 enum sanitize_state_warnings {
@@ -48,9 +50,248 @@ enum sanitize_state_warnings {
        IMPLICITLY_UPGRADED_PDSK,
 };
 
+static void count_objects(struct drbd_resource *resource,
+                         unsigned int *n_devices,
+                         unsigned int *n_connections)
+{
+       struct drbd_device *device;
+       struct drbd_connection *connection;
+       int vnr;
+
+       *n_devices = 0;
+       *n_connections = 0;
+
+       idr_for_each_entry(&resource->devices, device, vnr)
+               (*n_devices)++;
+       for_each_connection(connection, resource)
+               (*n_connections)++;
+}
+
+static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
+{
+       struct drbd_state_change *state_change;
+       unsigned int size, n;
+
+       size = sizeof(struct drbd_state_change) +
+              n_devices * sizeof(struct drbd_device_state_change) +
+              n_connections * sizeof(struct drbd_connection_state_change) +
+              n_devices * n_connections * sizeof(struct drbd_peer_device_state_change);
+       state_change = kmalloc(size, gfp);
+       if (!state_change)
+               return NULL;
+       state_change->n_devices = n_devices;
+       state_change->n_connections = n_connections;
+       state_change->devices = (void *)(state_change + 1);
+       state_change->connections = (void *)&state_change->devices[n_devices];
+       state_change->peer_devices = (void *)&state_change->connections[n_connections];
+       state_change->resource->resource = NULL;
+       for (n = 0; n < n_devices; n++)
+               state_change->devices[n].device = NULL;
+       for (n = 0; n < n_connections; n++)
+               state_change->connections[n].connection = NULL;
+       return state_change;
+}
+
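
alloc_state_change() packs the header and all three trailing arrays into a
single allocation and carves the array pointers out of the tail, so the one
kfree() in forget_state_change() releases everything at once. A simplified
userspace sketch of the layout trick (element types reduced to placeholders;
in the real structs every element begins with a pointer, so alignment of the
carved-out arrays works out):

#include <stdlib.h>

struct dev_sc  { void *device; };
struct conn_sc { void *connection; };
struct peer_sc { void *peer_device; };

struct state_change {
	unsigned int n_devices, n_connections;
	struct dev_sc *devices;
	struct conn_sc *connections;
	struct peer_sc *peer_devices;
};

static struct state_change *alloc_sc(unsigned int nd, unsigned int nc)
{
	struct state_change *sc;
	size_t size = sizeof(*sc) +
		      nd * sizeof(struct dev_sc) +
		      nc * sizeof(struct conn_sc) +
		      nd * nc * sizeof(struct peer_sc);

	sc = malloc(size);
	if (!sc)
		return NULL;
	sc->n_devices = nd;
	sc->n_connections = nc;
	sc->devices = (void *)(sc + 1);			 /* first trailing array */
	sc->connections = (void *)&sc->devices[nd];	 /* right after devices */
	sc->peer_devices = (void *)&sc->connections[nc]; /* after connections */
	return sc;
}

int main(void)
{
	struct state_change *sc = alloc_sc(2, 3);

	free(sc);	/* a single free releases header and all three arrays */
	return 0;
}
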
+struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp)
+{
+       struct drbd_state_change *state_change;
+       struct drbd_device *device;
+       unsigned int n_devices;
+       struct drbd_connection *connection;
+       unsigned int n_connections;
+       int vnr;
+
+       struct drbd_device_state_change *device_state_change;
+       struct drbd_peer_device_state_change *peer_device_state_change;
+       struct drbd_connection_state_change *connection_state_change;
+
+       /* Caller holds req_lock spinlock.
+        * No state, no device IDR, no connection lists can change. */
+       count_objects(resource, &n_devices, &n_connections);
+       state_change = alloc_state_change(n_devices, n_connections, gfp);
+       if (!state_change)
+               return NULL;
+
+       kref_get(&resource->kref);
+       state_change->resource->resource = resource;
+       state_change->resource->role[OLD] =
+               conn_highest_role(first_connection(resource));
+       state_change->resource->susp[OLD] = resource->susp;
+       state_change->resource->susp_nod[OLD] = resource->susp_nod;
+       state_change->resource->susp_fen[OLD] = resource->susp_fen;
+
+       connection_state_change = state_change->connections;
+       for_each_connection(connection, resource) {
+               kref_get(&connection->kref);
+               connection_state_change->connection = connection;
+               connection_state_change->cstate[OLD] =
+                       connection->cstate;
+               connection_state_change->peer_role[OLD] =
+                       conn_highest_peer(connection);
+               connection_state_change++;
+       }
+
+       device_state_change = state_change->devices;
+       peer_device_state_change = state_change->peer_devices;
+       idr_for_each_entry(&resource->devices, device, vnr) {
+               kref_get(&device->kref);
+               device_state_change->device = device;
+               device_state_change->disk_state[OLD] = device->state.disk;
+
+               /* The peer_devices for each device have to be enumerated in
+                  the order of the connections. We may not use for_each_peer_device() here. */
+               for_each_connection(connection, resource) {
+                       struct drbd_peer_device *peer_device;
+
+                       peer_device = conn_peer_device(connection, device->vnr);
+                       peer_device_state_change->peer_device = peer_device;
+                       peer_device_state_change->disk_state[OLD] =
+                               device->state.pdsk;
+                       peer_device_state_change->repl_state[OLD] =
+                               max_t(enum drbd_conns,
+                                     C_WF_REPORT_PARAMS, device->state.conn);
+                       peer_device_state_change->resync_susp_user[OLD] =
+                               device->state.user_isp;
+                       peer_device_state_change->resync_susp_peer[OLD] =
+                               device->state.peer_isp;
+                       peer_device_state_change->resync_susp_dependency[OLD] =
+                               device->state.aftr_isp;
+                       peer_device_state_change++;
+               }
+               device_state_change++;
+       }
+
+       return state_change;
+}
+
+static void remember_new_state(struct drbd_state_change *state_change)
+{
+       struct drbd_resource_state_change *resource_state_change;
+       struct drbd_resource *resource;
+       unsigned int n;
+
+       if (!state_change)
+               return;
+
+       resource_state_change = &state_change->resource[0];
+       resource = resource_state_change->resource;
+
+       resource_state_change->role[NEW] =
+               conn_highest_role(first_connection(resource));
+       resource_state_change->susp[NEW] = resource->susp;
+       resource_state_change->susp_nod[NEW] = resource->susp_nod;
+       resource_state_change->susp_fen[NEW] = resource->susp_fen;
+
+       for (n = 0; n < state_change->n_devices; n++) {
+               struct drbd_device_state_change *device_state_change =
+                       &state_change->devices[n];
+               struct drbd_device *device = device_state_change->device;
+
+               device_state_change->disk_state[NEW] = device->state.disk;
+       }
+
+       for (n = 0; n < state_change->n_connections; n++) {
+               struct drbd_connection_state_change *connection_state_change =
+                       &state_change->connections[n];
+               struct drbd_connection *connection =
+                       connection_state_change->connection;
+
+               connection_state_change->cstate[NEW] = connection->cstate;
+               connection_state_change->peer_role[NEW] =
+                       conn_highest_peer(connection);
+       }
+
+       for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) {
+               struct drbd_peer_device_state_change *peer_device_state_change =
+                       &state_change->peer_devices[n];
+               struct drbd_device *device =
+                       peer_device_state_change->peer_device->device;
+               union drbd_dev_state state = device->state;
+
+               peer_device_state_change->disk_state[NEW] = state.pdsk;
+               peer_device_state_change->repl_state[NEW] =
+                       max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn);
+               peer_device_state_change->resync_susp_user[NEW] =
+                       state.user_isp;
+               peer_device_state_change->resync_susp_peer[NEW] =
+                       state.peer_isp;
+               peer_device_state_change->resync_susp_dependency[NEW] =
+                       state.aftr_isp;
+       }
+}
+
+void copy_old_to_new_state_change(struct drbd_state_change *state_change)
+{
+       struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+       unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+
+#define OLD_TO_NEW(x) \
+       (x[NEW] = x[OLD])
+
+       OLD_TO_NEW(resource_state_change->role);
+       OLD_TO_NEW(resource_state_change->susp);
+       OLD_TO_NEW(resource_state_change->susp_nod);
+       OLD_TO_NEW(resource_state_change->susp_fen);
+
+       for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+               struct drbd_connection_state_change *connection_state_change =
+                               &state_change->connections[n_connection];
+
+               OLD_TO_NEW(connection_state_change->peer_role);
+               OLD_TO_NEW(connection_state_change->cstate);
+       }
+
+       for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+               struct drbd_device_state_change *device_state_change =
+                       &state_change->devices[n_device];
+
+               OLD_TO_NEW(device_state_change->disk_state);
+       }
+
+       n_peer_devices = state_change->n_devices * state_change->n_connections;
+       for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+               struct drbd_peer_device_state_change *p =
+                       &state_change->peer_devices[n_peer_device];
+
+               OLD_TO_NEW(p->disk_state);
+               OLD_TO_NEW(p->repl_state);
+               OLD_TO_NEW(p->resync_susp_user);
+               OLD_TO_NEW(p->resync_susp_peer);
+               OLD_TO_NEW(p->resync_susp_dependency);
+       }
+
+#undef OLD_TO_NEW
+}
+
+void forget_state_change(struct drbd_state_change *state_change)
+{
+       unsigned int n;
+
+       if (!state_change)
+               return;
+
+       if (state_change->resource->resource)
+               kref_put(&state_change->resource->resource->kref, drbd_destroy_resource);
+       for (n = 0; n < state_change->n_devices; n++) {
+               struct drbd_device *device = state_change->devices[n].device;
+
+               if (device)
+                       kref_put(&device->kref, drbd_destroy_device);
+       }
+       for (n = 0; n < state_change->n_connections; n++) {
+               struct drbd_connection *connection =
+                       state_change->connections[n].connection;
+
+               if (connection)
+                       kref_put(&connection->kref, drbd_destroy_connection);
+       }
+       kfree(state_change);
+}
+
 static int w_after_state_ch(struct drbd_work *w, int unused);
 static void after_state_ch(struct drbd_device *device, union drbd_state os,
-                          union drbd_state ns, enum chg_state_flags flags);
+                          union drbd_state ns, enum chg_state_flags flags,
+                          struct drbd_state_change *);
 static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
 static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
 static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
@@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
                return R_SECONDARY;
        return R_UNKNOWN;
 }
+
 static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
 {
        if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
@@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device)
                drbd_info(device, "Resumed AL updates\n");
 }
 
-/* helper for __drbd_set_state */
+/* helper for _drbd_set_state */
 static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
 {
        if (first_peer_device(device)->connection->agreed_pro_version < 90)
@@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
 }
 
 /**
- * __drbd_set_state() - Set a new DRBD state
+ * _drbd_set_state() - Set a new DRBD state
  * @device:    DRBD device.
  * @ns:                new state.
  * @flags:     Flags
  * @done:      Optional completion, that will get completed after the after_state_ch() finished
  *
- * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ * Caller needs to hold req_lock. Do not call directly.
  */
 enum drbd_state_rv
-__drbd_set_state(struct drbd_device *device, union drbd_state ns,
-                enum chg_state_flags flags, struct completion *done)
+_drbd_set_state(struct drbd_device *device, union drbd_state ns,
+               enum chg_state_flags flags, struct completion *done)
 {
        struct drbd_peer_device *peer_device = first_peer_device(device);
        struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
@@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
        enum drbd_state_rv rv = SS_SUCCESS;
        enum sanitize_state_warnings ssw;
        struct after_state_chg_work *ascw;
+       struct drbd_state_change *state_change;
 
        os = drbd_read_state(device);
 
@@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
        if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
                clear_bit(RS_DONE, &device->flags);
 
+       /* FIXME: Have any flags been set earlier in this function already? */
+       state_change = remember_old_state(device->resource, GFP_ATOMIC);
+
        /* changes to local_cnt and device flags should be visible before
         * changes to state, which again should be visible before anything else
         * depending on that change happens. */
@@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
        device->resource->susp_fen = ns.susp_fen;
        smp_wmb();
 
+       remember_new_state(state_change);
+
        /* put replicated vs not-replicated requests in separate epochs */
        if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
            drbd_should_do_remote((union drbd_dev_state)ns.i))
@@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
                ascw->w.cb = w_after_state_ch;
                ascw->device = device;
                ascw->done = done;
+               ascw->state_change = state_change;
                drbd_queue_work(&connection->sender_work,
                                &ascw->w);
        } else {
@@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused)
                container_of(w, struct after_state_chg_work, w);
        struct drbd_device *device = ascw->device;
 
-       after_state_ch(device, ascw->os, ascw->ns, ascw->flags);
+       after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change);
+       forget_state_change(ascw->state_change);
        if (ascw->flags & CS_WAIT_COMPLETE)
                complete(ascw->done);
        kfree(ascw);
@@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
        D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
 
        /* open coded non-blocking drbd_suspend_io(device); */
-       set_bit(SUSPEND_IO, &device->flags);
+       atomic_inc(&device->suspend_cnt);
 
        drbd_bm_lock(device, why, flags);
        rv = io_fn(device);
@@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
        return rv;
 }
 
+void notify_resource_state_change(struct sk_buff *skb,
+                                 unsigned int seq,
+                                 struct drbd_resource_state_change *resource_state_change,
+                                 enum drbd_notification_type type)
+{
+       struct drbd_resource *resource = resource_state_change->resource;
+       struct resource_info resource_info = {
+               .res_role = resource_state_change->role[NEW],
+               .res_susp = resource_state_change->susp[NEW],
+               .res_susp_nod = resource_state_change->susp_nod[NEW],
+               .res_susp_fen = resource_state_change->susp_fen[NEW],
+       };
+
+       notify_resource_state(skb, seq, resource, &resource_info, type);
+}
+
+void notify_connection_state_change(struct sk_buff *skb,
+                                   unsigned int seq,
+                                   struct drbd_connection_state_change *connection_state_change,
+                                   enum drbd_notification_type type)
+{
+       struct drbd_connection *connection = connection_state_change->connection;
+       struct connection_info connection_info = {
+               .conn_connection_state = connection_state_change->cstate[NEW],
+               .conn_role = connection_state_change->peer_role[NEW],
+       };
+
+       notify_connection_state(skb, seq, connection, &connection_info, type);
+}
+
+void notify_device_state_change(struct sk_buff *skb,
+                               unsigned int seq,
+                               struct drbd_device_state_change *device_state_change,
+                               enum drbd_notification_type type)
+{
+       struct drbd_device *device = device_state_change->device;
+       struct device_info device_info = {
+               .dev_disk_state = device_state_change->disk_state[NEW],
+       };
+
+       notify_device_state(skb, seq, device, &device_info, type);
+}
+
+void notify_peer_device_state_change(struct sk_buff *skb,
+                                    unsigned int seq,
+                                    struct drbd_peer_device_state_change *p,
+                                    enum drbd_notification_type type)
+{
+       struct drbd_peer_device *peer_device = p->peer_device;
+       struct peer_device_info peer_device_info = {
+               .peer_repl_state = p->repl_state[NEW],
+               .peer_disk_state = p->disk_state[NEW],
+               .peer_resync_susp_user = p->resync_susp_user[NEW],
+               .peer_resync_susp_peer = p->resync_susp_peer[NEW],
+               .peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
+       };
+
+       notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
+}
+
+static void broadcast_state_change(struct drbd_state_change *state_change)
+{
+       struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+       bool resource_state_has_changed;
+       unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+       void (*last_func)(struct sk_buff *, unsigned int, void *,
+                         enum drbd_notification_type) = NULL;
+       void *uninitialized_var(last_arg);
+
+#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
+#define FINAL_STATE_CHANGE(type) \
+       ({ if (last_func) \
+               last_func(NULL, 0, last_arg, type); \
+       })
+#define REMEMBER_STATE_CHANGE(func, arg, type) \
+       ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
+          last_func = (typeof(last_func))func; \
+          last_arg = arg; \
+        })
+
+       mutex_lock(&notification_mutex);
+
+       resource_state_has_changed =
+           HAS_CHANGED(resource_state_change->role) ||
+           HAS_CHANGED(resource_state_change->susp) ||
+           HAS_CHANGED(resource_state_change->susp_nod) ||
+           HAS_CHANGED(resource_state_change->susp_fen);
+
+       if (resource_state_has_changed)
+               REMEMBER_STATE_CHANGE(notify_resource_state_change,
+                                     resource_state_change, NOTIFY_CHANGE);
+
+       for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+               struct drbd_connection_state_change *connection_state_change =
+                               &state_change->connections[n_connection];
+
+               if (HAS_CHANGED(connection_state_change->peer_role) ||
+                   HAS_CHANGED(connection_state_change->cstate))
+                       REMEMBER_STATE_CHANGE(notify_connection_state_change,
+                                             connection_state_change, NOTIFY_CHANGE);
+       }
+
+       for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+               struct drbd_device_state_change *device_state_change =
+                       &state_change->devices[n_device];
+
+               if (HAS_CHANGED(device_state_change->disk_state))
+                       REMEMBER_STATE_CHANGE(notify_device_state_change,
+                                             device_state_change, NOTIFY_CHANGE);
+       }
+
+       n_peer_devices = state_change->n_devices * state_change->n_connections;
+       for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+               struct drbd_peer_device_state_change *p =
+                       &state_change->peer_devices[n_peer_device];
+
+               if (HAS_CHANGED(p->disk_state) ||
+                   HAS_CHANGED(p->repl_state) ||
+                   HAS_CHANGED(p->resync_susp_user) ||
+                   HAS_CHANGED(p->resync_susp_peer) ||
+                   HAS_CHANGED(p->resync_susp_dependency))
+                       REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
+                                             p, NOTIFY_CHANGE);
+       }
+
+       FINAL_STATE_CHANGE(NOTIFY_CHANGE);
+       mutex_unlock(&notification_mutex);
+
+#undef HAS_CHANGED
+#undef FINAL_STATE_CHANGE
+#undef REMEMBER_STATE_CHANGE
+}
+
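
broadcast_state_change() wants the receiver to be able to tell which
notification ends a group: every notification except the last one carries
NOTIFY_CONTINUES. The REMEMBER/FINAL macros achieve that by always emitting
the previously remembered event when the next one is discovered, one step
delayed, and then flushing the final event without the flag. A small
standalone sketch of the pattern:

#include <stdio.h>

enum { NOTIFY_CHANGE = 0, NOTIFY_CONTINUES = 1 };

typedef void (*notify_fn)(void *arg, int type);

static notify_fn last_func;
static void *last_arg;

static void final_notification(int type)
{
	if (last_func)
		last_func(last_arg, type);
}

static void remember_notification(notify_fn fn, void *arg, int type)
{
	final_notification(type | NOTIFY_CONTINUES);	/* flush predecessor */
	last_func = fn;
	last_arg = arg;
}

static void print_event(void *arg, int type)
{
	printf("%s%s\n", (char *)arg,
	       (type & NOTIFY_CONTINUES) ? " (continues)" : "");
}

int main(void)
{
	remember_notification(print_event, (void *)"resource", NOTIFY_CHANGE);
	remember_notification(print_event, (void *)"connection", NOTIFY_CHANGE);
	final_notification(NOTIFY_CHANGE);
	/* prints "resource (continues)" and then "connection" */
	return 0;
}
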
 /**
  * after_state_ch() - Perform after state change actions that may sleep
  * @device:    DRBD device.
@@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
  * @flags:     Flags
  */
 static void after_state_ch(struct drbd_device *device, union drbd_state os,
-                          union drbd_state ns, enum chg_state_flags flags)
+                          union drbd_state ns, enum chg_state_flags flags,
+                          struct drbd_state_change *state_change)
 {
        struct drbd_resource *resource = device->resource;
        struct drbd_peer_device *peer_device = first_peer_device(device);
        struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
        struct sib_info sib;
 
+       broadcast_state_change(state_change);
+
        sib.sib_reason = SIB_STATE_CHANGE;
        sib.os = os;
        sib.ns = ns;
@@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        }
 
        if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
-               if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+               if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY &&
                    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
                        drbd_uuid_new_current(device);
                        drbd_send_uuids(peer_device);
@@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
        if (os.disk != D_FAILED && ns.disk == D_FAILED) {
                enum drbd_io_error_p eh = EP_PASS_ON;
                int was_io_error = 0;
-               /* corresponding get_ldev was in __drbd_set_state, to serialize
+               /* corresponding get_ldev was in _drbd_set_state, to serialize
                 * our cleanup here with the transition to D_DISKLESS.
                 * But it is still not safe to dereference ldev here, since
                 * we might come from a failed Attach before ldev was set. */
@@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 
                        was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
 
+                       /* Intentionally call this handler first, before drbd_send_state().
+                        * See: 2932204 drbd: call local-io-error handler early
+                        * People may choose to hard-reset the box from this handler.
+                        * It is useful if this looks like a "regular node crash". */
                        if (was_io_error && eh == EP_CALL_HELPER)
                                drbd_khelper(device, "local-io-error");
 
@@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work {
        union drbd_state ns_max; /* new, max state, over all devices */
        enum chg_state_flags flags;
        struct drbd_connection *connection;
+       struct drbd_state_change *state_change;
 };
 
 static int w_after_conn_state_ch(struct drbd_work *w, int unused)
@@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
        struct drbd_peer_device *peer_device;
        int vnr;
 
+       broadcast_state_change(acscw->state_change);
+       forget_state_change(acscw->state_change);
        kfree(acscw);
 
        /* Upon network configuration, we need to start the receiver */
@@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
        if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
                struct net_conf *old_conf;
 
+               mutex_lock(&notification_mutex);
+               idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+                       notify_peer_device_state(NULL, 0, peer_device, NULL,
+                                                NOTIFY_DESTROY | NOTIFY_CONTINUES);
+               notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY);
+               mutex_unlock(&notification_mutex);
+
                mutex_lock(&connection->resource->conf_update);
                old_conf = connection->net_conf;
                connection->my_addr_len = 0;
@@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
                if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
                        ns.disk = os.disk;
 
-               rv = __drbd_set_state(device, ns, flags, NULL);
+               rv = _drbd_set_state(device, ns, flags, NULL);
                if (rv < SS_SUCCESS)
                        BUG();
 
@@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
        enum drbd_conns oc = connection->cstate;
        union drbd_state ns_max, ns_min, os;
        bool have_mutex = false;
+       struct drbd_state_change *state_change;
 
        if (mask.conn) {
                rv = is_valid_conn_transition(oc, val.conn);
@@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
                        goto abort;
        }
 
+       state_change = remember_old_state(connection->resource, GFP_ATOMIC);
        conn_old_common_state(connection, &os, &flags);
        flags |= CS_DC_SUSP;
        conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
        conn_pr_state_change(connection, os, ns_max, flags);
+       remember_new_state(state_change);
 
        acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
        if (acscw) {
@@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u
                acscw->w.cb = w_after_conn_state_ch;
                kref_get(&connection->kref);
                acscw->connection = connection;
+               acscw->state_change = state_change;
                drbd_queue_work(&connection->sender_work, &acscw->w);
        } else {
                drbd_err(connection, "Could not kmalloc an acscw\n");
index 7f53c40823cd581e28279b9ef046a07af54a974e..bd989536f888fc05af96942858795dc34713045f 100644 (file)
@@ -122,9 +122,9 @@ extern enum drbd_state_rv
 _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
                                        union drbd_state, enum chg_state_flags);
 
-extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
-                                          enum chg_state_flags,
-                                          struct completion *done);
+extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state,
+                                         enum chg_state_flags,
+                                         struct completion *done);
 extern void print_st_err(struct drbd_device *, union drbd_state,
                        union drbd_state, int);
 
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
new file mode 100644 (file)
index 0000000..9e503a1
--- /dev/null
@@ -0,0 +1,63 @@
+#ifndef DRBD_STATE_CHANGE_H
+#define DRBD_STATE_CHANGE_H
+
+struct drbd_resource_state_change {
+       struct drbd_resource *resource;
+       enum drbd_role role[2];
+       bool susp[2];
+       bool susp_nod[2];
+       bool susp_fen[2];
+};
+
+struct drbd_device_state_change {
+       struct drbd_device *device;
+       enum drbd_disk_state disk_state[2];
+};
+
+struct drbd_connection_state_change {
+       struct drbd_connection *connection;
+       enum drbd_conns cstate[2];  /* drbd9: enum drbd_conn_state */
+       enum drbd_role peer_role[2];
+};
+
+struct drbd_peer_device_state_change {
+       struct drbd_peer_device *peer_device;
+       enum drbd_disk_state disk_state[2];
+       enum drbd_conns repl_state[2];  /* drbd9: enum drbd_repl_state */
+       bool resync_susp_user[2];
+       bool resync_susp_peer[2];
+       bool resync_susp_dependency[2];
+};
+
+struct drbd_state_change {
+       struct list_head list;
+       unsigned int n_devices;
+       unsigned int n_connections;
+       struct drbd_resource_state_change resource[1];
+       struct drbd_device_state_change *devices;
+       struct drbd_connection_state_change *connections;
+       struct drbd_peer_device_state_change *peer_devices;
+};
+
+extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t);
+extern void copy_old_to_new_state_change(struct drbd_state_change *);
+extern void forget_state_change(struct drbd_state_change *);
+
+extern void notify_resource_state_change(struct sk_buff *,
+                                        unsigned int,
+                                        struct drbd_resource_state_change *,
+                                        enum drbd_notification_type type);
+extern void notify_connection_state_change(struct sk_buff *,
+                                          unsigned int,
+                                          struct drbd_connection_state_change *,
+                                          enum drbd_notification_type type);
+extern void notify_device_state_change(struct sk_buff *,
+                                      unsigned int,
+                                      struct drbd_device_state_change *,
+                                      enum drbd_notification_type type);
+extern void notify_peer_device_state_change(struct sk_buff *,
+                                           unsigned int,
+                                           struct drbd_peer_device_state_change *,
+                                           enum drbd_notification_type type);
+
+#endif  /* DRBD_STATE_CHANGE_H */
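
The paired two-element arrays in the new header above hold before/after snapshots of each field. A minimal sketch of how a consumer might test for a transition; the OLD/NEW index names are illustrative only and are not defined by this patch:

#include <linux/types.h>	/* bool */
/* assumes the drbd_state_change.h declarations from the new file above */

enum { OLD = 0, NEW = 1 };	/* illustrative index names */

/* True when the resource role differs between the two snapshots. */
static bool example_role_changed(const struct drbd_resource_state_change *rsc)
{
	return rsc->role[OLD] != rsc->role[NEW];
}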
index 5578c1477ba6610f27f80e4cffaa874939a05364..eff716c27b1fdf53caa7440ac5bad755c271ad43 100644 (file)
@@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int);
  *
  */
 
-
-/* About the global_state_lock
-   Each state transition on an device holds a read lock. In case we have
-   to evaluate the resync after dependencies, we grab a write lock, because
-   we need stable states on all devices for that.  */
-rwlock_t global_state_lock;
-
 /* used for synchronous meta data and bitmap IO
  * submitted by drbd_md_sync_page_io()
  */
@@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
        unsigned long flags = 0;
        struct drbd_peer_device *peer_device = peer_req->peer_device;
        struct drbd_device *device = peer_device->device;
+       struct drbd_connection *connection = peer_device->connection;
        struct drbd_interval i;
        int do_wake;
        u64 block_id;
@@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
         * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
        if (peer_req->flags & EE_WAS_ERROR)
                __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+
+       if (connection->cstate >= C_WF_REPORT_PARAMS) {
+               kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
+               if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
+                       kref_put(&device->kref, drbd_destroy_device);
+       }
        spin_unlock_irqrestore(&device->resource->req_lock, flags);
 
        if (block_id == ID_SYNCER)
@@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
        if (do_al_complete_io)
                drbd_al_complete_io(device, &i);
 
-       wake_asender(peer_device->connection);
        put_ldev(device);
 }
 
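The added block above uses a standard kref idiom: pin the device before queue_work(), and drop the extra reference if queue_work() returns false because the work item was already pending. A standalone sketch of that idiom, with hypothetical names:

#include <linux/kref.h>
#include <linux/workqueue.h>

/* Pin 'ref' for the worker; if the item was already queued, no new
 * execution was scheduled, so release the reference immediately. */
static void example_queue_with_ref(struct workqueue_struct *wq,
				   struct work_struct *work,
				   struct kref *ref,
				   void (*release)(struct kref *))
{
	kref_get(ref);
	if (!queue_work(wq, work))
		kref_put(ref, release);
}
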
@@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio)
        }
 }
 
+void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
+{
+       panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
+               device->minor, device->resource->name, device->vnr);
+}
+
 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
  */
 void drbd_request_endio(struct bio *bio)
@@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio)
                        drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
 
                if (!bio->bi_error)
-                       panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+                       drbd_panic_after_delayed_completion_of_aborted_request(device);
        }
 
        /* to avoid recursion in __req_mod */
@@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection)
        p->barrier = connection->send.current_epoch_nr;
        p->pad = 0;
        connection->send.current_epoch_writes = 0;
+       connection->send.last_sent_barrier_jif = jiffies;
 
        return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
 }
@@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned
                connection->send.seen_any_write_yet = true;
                connection->send.current_epoch_nr = epoch;
                connection->send.current_epoch_writes = 0;
+               connection->send.last_sent_barrier_jif = jiffies;
        }
 }
 
@@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device)
 }
 
 /**
- * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * drbd_pause_after() - Pause resync on all devices that may not resync now
  * @device:    DRBD device.
  *
  * Called from process context only (admin command and after_state_ch).
  */
-static int _drbd_pause_after(struct drbd_device *device)
+static bool drbd_pause_after(struct drbd_device *device)
 {
+       bool changed = false;
        struct drbd_device *odev;
-       int i, rv = 0;
+       int i;
 
        rcu_read_lock();
        idr_for_each_entry(&drbd_devices, odev, i) {
                if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
                        continue;
-               if (!_drbd_may_sync_now(odev))
-                       rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
-                              != SS_NOTHING_TO_DO);
+               if (!_drbd_may_sync_now(odev) &&
+                   _drbd_set_state(_NS(odev, aftr_isp, 1),
+                                   CS_HARD, NULL) != SS_NOTHING_TO_DO)
+                       changed = true;
        }
        rcu_read_unlock();
 
-       return rv;
+       return changed;
 }
 
 /**
- * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * drbd_resume_next() - Resume resync on all devices that may resync now
  * @device:    DRBD device.
  *
  * Called from process context only (admin command and worker).
  */
-static int _drbd_resume_next(struct drbd_device *device)
+static bool drbd_resume_next(struct drbd_device *device)
 {
+       bool changed = false;
        struct drbd_device *odev;
-       int i, rv = 0;
+       int i;
 
        rcu_read_lock();
        idr_for_each_entry(&drbd_devices, odev, i) {
                if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
                        continue;
                if (odev->state.aftr_isp) {
-                       if (_drbd_may_sync_now(odev))
-                               rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
-                                                       CS_HARD, NULL)
-                                      != SS_NOTHING_TO_DO) ;
+                       if (_drbd_may_sync_now(odev) &&
+                           _drbd_set_state(_NS(odev, aftr_isp, 0),
+                                           CS_HARD, NULL) != SS_NOTHING_TO_DO)
+                               changed = true;
                }
        }
        rcu_read_unlock();
-       return rv;
+       return changed;
 }
 
 void resume_next_sg(struct drbd_device *device)
 {
-       write_lock_irq(&global_state_lock);
-       _drbd_resume_next(device);
-       write_unlock_irq(&global_state_lock);
+       lock_all_resources();
+       drbd_resume_next(device);
+       unlock_all_resources();
 }
 
 void suspend_other_sg(struct drbd_device *device)
 {
-       write_lock_irq(&global_state_lock);
-       _drbd_pause_after(device);
-       write_unlock_irq(&global_state_lock);
+       lock_all_resources();
+       drbd_pause_after(device);
+       unlock_all_resources();
 }
 
-/* caller must hold global_state_lock */
+/* caller must lock_all_resources() */
 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
 {
        struct drbd_device *odev;
@@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min
        }
 }
 
-/* caller must hold global_state_lock */
+/* caller must lock_all_resources() */
 void drbd_resync_after_changed(struct drbd_device *device)
 {
-       int changes;
+       int changed;
 
        do {
-               changes  = _drbd_pause_after(device);
-               changes |= _drbd_resume_next(device);
-       } while (changes);
+               changed  = drbd_pause_after(device);
+               changed |= drbd_resume_next(device);
+       } while (changed);
 }
 
 void drbd_rs_controller_reset(struct drbd_device *device)
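
The do/while in drbd_resync_after_changed() above iterates the pause and resume passes to a fixed point, since each pass can unblock work for the other; both passes run on every iteration because the result is accumulated with |= rather than short-circuited. A minimal sketch of the pattern, with hypothetical function pointers:

#include <linux/types.h>

/* Repeat both passes until a full round reports no change. */
static void example_settle(bool (*pause_pass)(void), bool (*resume_pass)(void))
{
	bool changed;

	do {
		changed  = pause_pass();
		changed |= resume_pass();	/* both passes always run */
	} while (changed);
}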
@@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
        } else {
                mutex_lock(device->state_mutex);
        }
-       clear_bit(B_RS_H_DONE, &device->flags);
 
-       /* req_lock: serialize with drbd_send_and_submit() and others
-        * global_state_lock: for stable sync-after dependencies */
-       spin_lock_irq(&device->resource->req_lock);
-       write_lock(&global_state_lock);
+       lock_all_resources();
+       clear_bit(B_RS_H_DONE, &device->flags);
        /* Did some connection breakage or IO error race with us? */
        if (device->state.conn < C_CONNECTED
        || !get_ldev_if_state(device, D_NEGOTIATING)) {
-               write_unlock(&global_state_lock);
-               spin_unlock_irq(&device->resource->req_lock);
-               mutex_unlock(device->state_mutex);
-               return;
+               unlock_all_resources();
+               goto out;
        }
 
        ns = drbd_read_state(device);
@@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
        else /* side == C_SYNC_SOURCE */
                ns.pdsk = D_INCONSISTENT;
 
-       r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
+       r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
        ns = drbd_read_state(device);
 
        if (ns.conn < C_CONNECTED)
@@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                        device->rs_mark_left[i] = tw;
                        device->rs_mark_time[i] = now;
                }
-               _drbd_pause_after(device);
+               drbd_pause_after(device);
                /* Forget potentially stale cached per resync extent bit-counts.
                 * Open coded drbd_rs_cancel_all(device), we already have IRQs
                 * disabled, and know the disk state is ok. */
@@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                device->resync_wenr = LC_FREE;
                spin_unlock(&device->al_lock);
        }
-       write_unlock(&global_state_lock);
-       spin_unlock_irq(&device->resource->req_lock);
+       unlock_all_resources();
 
        if (r == SS_SUCCESS) {
                wake_up(&device->al_wait); /* for lc_reset() above */
@@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                drbd_md_sync(device);
        }
        put_ldev(device);
+out:
        mutex_unlock(device->state_mutex);
 }
 
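Replacing the stacked unlock-and-return sequences with goto out gives drbd_start_resync() a single exit that releases state_mutex in one place. A skeleton of that single-exit shape; 'ok' stands in for the connection and ldev checks:

#include <linux/mutex.h>
#include <linux/types.h>

/* Every failure path funnels through 'out:' so the unlock cannot be missed. */
static void example_single_exit(struct mutex *state_mutex, bool ok)
{
	mutex_lock(state_mutex);

	if (!ok)
		goto out;

	/* ... resync setup work ... */
out:
	mutex_unlock(state_mutex);
}
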
@@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device)
        device->act_log = NULL;
 
        __acquire(local);
-       drbd_free_ldev(device->ldev);
+       drbd_backing_dev_free(device, device->ldev);
        device->ldev = NULL;
        __release(local);
 
index 15bec407ac37c214855aeaa69e0c45ecbe25f0e6..9b180dbbd03cc822c776e636183614bf59a76d3d 100644 (file)
 /* Device instance number, incremented each time a device is probed. */
 static int instance;
 
-struct list_head online_list;
-struct list_head removing_list;
-spinlock_t dev_lock;
+static struct list_head online_list;
+static struct list_head removing_list;
+static spinlock_t dev_lock;
 
 /*
  * Global variable used to hold the major block device number
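
Giving these file-scope variables internal linkage keeps them out of the kernel-wide symbol namespace. As a side note, list heads and spinlocks declared this way can also be initialized at compile time; an illustrative sketch with hypothetical names:

#include <linux/list.h>
#include <linux/spinlock.h>

static LIST_HEAD(example_online_list);		/* ready to use, no INIT_LIST_HEAD() */
static DEFINE_SPINLOCK(example_dev_lock);	/* no runtime spin_lock_init() */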
index 95dff91135ad6396aa2467ae1093d784b66aa020..8ba1e97d573c3b804c2ee7d14946f94b4c32fc93 100644 (file)
@@ -436,9 +436,8 @@ static void null_del_dev(struct nullb *nullb)
 static void null_lnvm_end_io(struct request *rq, int error)
 {
        struct nvm_rq *rqd = rq->end_io_data;
-       struct nvm_dev *dev = rqd->dev;
 
-       dev->mt->end_io(rqd, error);
+       nvm_end_io(rqd, error);
 
        blk_put_request(rq);
 }
@@ -495,17 +494,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
        id->ppaf.ch_offset = 56;
        id->ppaf.ch_len = 8;
 
-       do_div(size, bs); /* convert size to pages */
-       do_div(size, 256); /* concert size to pgs pr blk */
+       sector_div(size, bs); /* convert size to pages */
+       size >>= 8; /* convert size to pages per block */
        grp = &id->groups[0];
        grp->mtype = 0;
        grp->fmtype = 0;
        grp->num_ch = 1;
        grp->num_pg = 256;
        blksize = size;
-       do_div(size, (1 << 16));
+       size >>= 16;
        grp->num_lun = size + 1;
-       do_div(blksize, grp->num_lun);
+       sector_div(blksize, grp->num_lun);
        grp->num_blk = blksize;
        grp->num_pln = 1;
 
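do_div() expects a u64 first operand, while the value being divided here is a sector_t that may be 32 bits wide on some configurations; sector_div() handles both widths, and the power-of-two divisors (256 and 1 << 16) reduce to plain shifts. A hedged sketch of the idiom; the header for sector_div() is assumed:

#include <linux/blkdev.h>	/* sector_div(), assumed location */
#include <linux/types.h>

/* sector_div() divides its first argument in place and yields the
 * remainder; powers of two need only a shift. */
static sector_t example_pages_per_blk(sector_t size, unsigned int bs)
{
	sector_div(size, bs);	/* size /= bs */
	return size >> 8;	/* size /= 256 */
}
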
index 81ea69fee7ca183313b8e8322833062262279187..4a876785b68cd5c550dbbb1d2b63071446454081 100644 (file)
@@ -5185,8 +5185,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
 
 out_err:
        rbd_dev_unparent(rbd_dev);
-       if (parent)
-               rbd_dev_destroy(parent);
+       rbd_dev_destroy(parent);
        return ret;
 }
 
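Dropping the if (parent) guard assumes rbd_dev_destroy() tolerates NULL in the same way kfree(NULL) is a no-op. A sketch of that convention with a hypothetical type:

#include <linux/slab.h>

struct example_obj {
	int placeholder;
};

/* NULL-safe destructor: error paths may pass NULL unconditionally. */
static void example_destroy(struct example_obj *obj)
{
	if (!obj)
		return;
	kfree(obj);
}
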
index 59c91d49b14b649f839f8af5951772436d959da8..ba4bfe933276e34310a4208a10eb2e364dfefa7e 100644 (file)
@@ -23,7 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
-#include <linux/time.h>
+#include <linux/ktime.h>
 #include <linux/hdreg.h>
 #include <linux/dma-mapping.h>
 #include <linux/completion.h>
@@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 static unsigned int carm_fill_sync_time(struct carm_host *host,
                                        unsigned int idx, void *mem)
 {
-       struct timeval tv;
        struct carm_msg_sync_time *st = mem;
 
-       do_gettimeofday(&tv);
+       time64_t tv = ktime_get_real_seconds();
 
        memset(st, 0, sizeof(*st));
        st->type        = CARM_MSG_MISC;
        st->subtype     = MISC_SET_TIME;
        st->handle      = cpu_to_le32(TAG_ENCODE(idx));
-       st->timestamp   = cpu_to_le32(tv.tv_sec);
+       st->timestamp   = cpu_to_le32(tv);
 
        return sizeof(struct carm_msg_sync_time);
 }
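
Switching from do_gettimeofday() to ktime_get_real_seconds() drops the 32-bit tv_sec that overflows in 2038; the device still receives a little-endian 32-bit field, so the truncation simply moves to the cpu_to_le32() call. A sketch of the replacement idiom:

#include <asm/byteorder.h>	/* cpu_to_le32() */
#include <linux/ktime.h>	/* ktime_get_real_seconds() */
#include <linux/types.h>

/* Keep time64_t internally; truncate only where the wire format demands it. */
static __le32 example_timestamp_le32(void)
{
	time64_t now = ktime_get_real_seconds();

	return cpu_to_le32((u32)now);
}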
index 41fb1a917b172dc9446495d84357060847c2b569..4809c1501d7eda82081d922fa1944d8cd7c76c84 100644 (file)
@@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
 MODULE_PARM_DESC(max_persistent_grants,
                  "Maximum number of grants to map persistently");
 
+/*
+ * Maximum number of rings/queues blkback supports; allow as many queues as
+ * there are CPUs if the user has not specified a value.
+ */
+unsigned int xenblk_max_queues;
+module_param_named(max_queues, xenblk_max_queues, uint, 0644);
+MODULE_PARM_DESC(max_queues,
+                "Maximum number of hardware queues per virtual disk. " \
+                "By default it is the number of online CPUs.");
+
 /*
  * Maximum order of pages to be used for the shared ring between front and
  * backend, 4KB page granularity is used.
@@ -113,71 +123,71 @@ module_param(log_stats, int, 0644);
 /* Number of free pages to remove on each call to gnttab_free_pages */
 #define NUM_BATCH_FREE_PAGES 10
 
-static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
+static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&blkif->free_pages_lock, flags);
-       if (list_empty(&blkif->free_pages)) {
-               BUG_ON(blkif->free_pages_num != 0);
-               spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+       spin_lock_irqsave(&ring->free_pages_lock, flags);
+       if (list_empty(&ring->free_pages)) {
+               BUG_ON(ring->free_pages_num != 0);
+               spin_unlock_irqrestore(&ring->free_pages_lock, flags);
                return gnttab_alloc_pages(1, page);
        }
-       BUG_ON(blkif->free_pages_num == 0);
-       page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
+       BUG_ON(ring->free_pages_num == 0);
+       page[0] = list_first_entry(&ring->free_pages, struct page, lru);
        list_del(&page[0]->lru);
-       blkif->free_pages_num--;
-       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+       ring->free_pages_num--;
+       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 
        return 0;
 }
 
-static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
                                   int num)
 {
        unsigned long flags;
        int i;
 
-       spin_lock_irqsave(&blkif->free_pages_lock, flags);
+       spin_lock_irqsave(&ring->free_pages_lock, flags);
        for (i = 0; i < num; i++)
-               list_add(&page[i]->lru, &blkif->free_pages);
-       blkif->free_pages_num += num;
-       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+               list_add(&page[i]->lru, &ring->free_pages);
+       ring->free_pages_num += num;
+       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 }
 
-static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
+static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
 {
        /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
        struct page *page[NUM_BATCH_FREE_PAGES];
        unsigned int num_pages = 0;
        unsigned long flags;
 
-       spin_lock_irqsave(&blkif->free_pages_lock, flags);
-       while (blkif->free_pages_num > num) {
-               BUG_ON(list_empty(&blkif->free_pages));
-               page[num_pages] = list_first_entry(&blkif->free_pages,
+       spin_lock_irqsave(&ring->free_pages_lock, flags);
+       while (ring->free_pages_num > num) {
+               BUG_ON(list_empty(&ring->free_pages));
+               page[num_pages] = list_first_entry(&ring->free_pages,
                                                   struct page, lru);
                list_del(&page[num_pages]->lru);
-               blkif->free_pages_num--;
+               ring->free_pages_num--;
                if (++num_pages == NUM_BATCH_FREE_PAGES) {
-                       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+                       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
                        gnttab_free_pages(num_pages, page);
-                       spin_lock_irqsave(&blkif->free_pages_lock, flags);
+                       spin_lock_irqsave(&ring->free_pages_lock, flags);
                        num_pages = 0;
                }
        }
-       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
        if (num_pages != 0)
                gnttab_free_pages(num_pages, page);
 }
 
 #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
 
-static int do_block_io_op(struct xen_blkif *blkif);
-static int dispatch_rw_block_io(struct xen_blkif *blkif,
+static int do_block_io_op(struct xen_blkif_ring *ring);
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
                                struct blkif_request *req,
                                struct pending_req *pending_req);
-static void make_response(struct xen_blkif *blkif, u64 id,
+static void make_response(struct xen_blkif_ring *ring, u64 id,
                          unsigned short op, int st);
 
 #define foreach_grant_safe(pos, n, rbtree, node) \
@@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id,
 
 /*
  * We don't need locking around the persistent grant helpers
- * because blkback uses a single-thread for each backed, so we
+ * because blkback uses a single-thread for each backend, so we
  * can be sure that this functions will never be called recursively.
  *
  * The only exception to that is put_persistent_grant, that can be called
@@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
  * bit operations to modify the flags of a persistent grant and to count
  * the number of used grants.
  */
-static int add_persistent_gnt(struct xen_blkif *blkif,
+static int add_persistent_gnt(struct xen_blkif_ring *ring,
                               struct persistent_gnt *persistent_gnt)
 {
        struct rb_node **new = NULL, *parent = NULL;
        struct persistent_gnt *this;
+       struct xen_blkif *blkif = ring->blkif;
 
-       if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+       if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
                if (!blkif->vbd.overflow_max_grants)
                        blkif->vbd.overflow_max_grants = 1;
                return -EBUSY;
        }
        /* Figure out where to put new node */
-       new = &blkif->persistent_gnts.rb_node;
+       new = &ring->persistent_gnts.rb_node;
        while (*new) {
                this = container_of(*new, struct persistent_gnt, node);
 
@@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
        set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
        /* Add new node and rebalance tree. */
        rb_link_node(&(persistent_gnt->node), parent, new);
-       rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
-       blkif->persistent_gnt_c++;
-       atomic_inc(&blkif->persistent_gnt_in_use);
+       rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
+       ring->persistent_gnt_c++;
+       atomic_inc(&ring->persistent_gnt_in_use);
        return 0;
 }
 
-static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
                                                 grant_ref_t gref)
 {
        struct persistent_gnt *data;
        struct rb_node *node = NULL;
 
-       node = blkif->persistent_gnts.rb_node;
+       node = ring->persistent_gnts.rb_node;
        while (node) {
                data = container_of(node, struct persistent_gnt, node);
 
@@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
                                return NULL;
                        }
                        set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
-                       atomic_inc(&blkif->persistent_gnt_in_use);
+                       atomic_inc(&ring->persistent_gnt_in_use);
                        return data;
                }
        }
        return NULL;
 }
 
-static void put_persistent_gnt(struct xen_blkif *blkif,
+static void put_persistent_gnt(struct xen_blkif_ring *ring,
                                struct persistent_gnt *persistent_gnt)
 {
        if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                pr_alert_ratelimited("freeing a grant already unused\n");
        set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
        clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
-       atomic_dec(&blkif->persistent_gnt_in_use);
+       atomic_dec(&ring->persistent_gnt_in_use);
 }
 
-static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
                                  unsigned int num)
 {
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
                        unmap_data.count = segs_to_unmap;
                        BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
 
-                       put_free_pages(blkif, pages, segs_to_unmap);
+                       put_free_pages(ring, pages, segs_to_unmap);
                        segs_to_unmap = 0;
                }
 
@@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt;
        int segs_to_unmap = 0;
-       struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+       struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
        struct gntab_unmap_queue_data unmap_data;
 
        unmap_data.pages = pages;
        unmap_data.unmap_ops = unmap;
        unmap_data.kunmap_ops = NULL;
 
-       while(!list_empty(&blkif->persistent_purge_list)) {
-               persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
+       while(!list_empty(&ring->persistent_purge_list)) {
+               persistent_gnt = list_first_entry(&ring->persistent_purge_list,
                                                  struct persistent_gnt,
                                                  remove_node);
                list_del(&persistent_gnt->remove_node);
@@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
                if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
                        unmap_data.count = segs_to_unmap;
                        BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
-                       put_free_pages(blkif, pages, segs_to_unmap);
+                       put_free_pages(ring, pages, segs_to_unmap);
                        segs_to_unmap = 0;
                }
                kfree(persistent_gnt);
@@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
        if (segs_to_unmap > 0) {
                unmap_data.count = segs_to_unmap;
                BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
-               put_free_pages(blkif, pages, segs_to_unmap);
+               put_free_pages(ring, pages, segs_to_unmap);
        }
 }
 
-static void purge_persistent_gnt(struct xen_blkif *blkif)
+static void purge_persistent_gnt(struct xen_blkif_ring *ring)
 {
        struct persistent_gnt *persistent_gnt;
        struct rb_node *n;
@@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
        bool scan_used = false, clean_used = false;
        struct rb_root *root;
 
-       if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
-           (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
-           !blkif->vbd.overflow_max_grants)) {
-               return;
+       if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
+           (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
+           !ring->blkif->vbd.overflow_max_grants)) {
+               goto out;
        }
 
-       if (work_busy(&blkif->persistent_purge_work)) {
+       if (work_busy(&ring->persistent_purge_work)) {
                pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
-               return;
+               goto out;
        }
 
        num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
-       num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
-       num_clean = min(blkif->persistent_gnt_c, num_clean);
+       num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+       num_clean = min(ring->persistent_gnt_c, num_clean);
        if ((num_clean == 0) ||
-           (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
-               return;
+           (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
+               goto out;
 
        /*
         * At this point, we can assure that there will be no calls
@@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
 
        pr_debug("Going to purge %u persistent grants\n", num_clean);
 
-       BUG_ON(!list_empty(&blkif->persistent_purge_list));
-       root = &blkif->persistent_gnts;
+       BUG_ON(!list_empty(&ring->persistent_purge_list));
+       root = &ring->persistent_gnts;
 purge_list:
        foreach_grant_safe(persistent_gnt, n, root, node) {
                BUG_ON(persistent_gnt->handle ==
@@ -414,7 +425,7 @@ purge_list:
 
                rb_erase(&persistent_gnt->node, root);
                list_add(&persistent_gnt->remove_node,
-                        &blkif->persistent_purge_list);
+                        &ring->persistent_purge_list);
                if (--num_clean == 0)
                        goto finished;
        }
@@ -435,30 +446,32 @@ finished:
                goto purge_list;
        }
 
-       blkif->persistent_gnt_c -= (total - num_clean);
-       blkif->vbd.overflow_max_grants = 0;
+       ring->persistent_gnt_c -= (total - num_clean);
+       ring->blkif->vbd.overflow_max_grants = 0;
 
        /* We can defer this work */
-       schedule_work(&blkif->persistent_purge_work);
+       schedule_work(&ring->persistent_purge_work);
        pr_debug("Purged %u/%u\n", (total - num_clean), total);
+
+out:
        return;
 }
 
 /*
  * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
  */
-static struct pending_req *alloc_req(struct xen_blkif *blkif)
+static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
 {
        struct pending_req *req = NULL;
        unsigned long flags;
 
-       spin_lock_irqsave(&blkif->pending_free_lock, flags);
-       if (!list_empty(&blkif->pending_free)) {
-               req = list_entry(blkif->pending_free.next, struct pending_req,
+       spin_lock_irqsave(&ring->pending_free_lock, flags);
+       if (!list_empty(&ring->pending_free)) {
+               req = list_entry(ring->pending_free.next, struct pending_req,
                                 free_list);
                list_del(&req->free_list);
        }
-       spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+       spin_unlock_irqrestore(&ring->pending_free_lock, flags);
        return req;
 }
 
@@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
  * Return the 'pending_req' structure back to the freepool. We also
  * wake up the thread if it was waiting for a free page.
  */
-static void free_req(struct xen_blkif *blkif, struct pending_req *req)
+static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
 {
        unsigned long flags;
        int was_empty;
 
-       spin_lock_irqsave(&blkif->pending_free_lock, flags);
-       was_empty = list_empty(&blkif->pending_free);
-       list_add(&req->free_list, &blkif->pending_free);
-       spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+       spin_lock_irqsave(&ring->pending_free_lock, flags);
+       was_empty = list_empty(&ring->pending_free);
+       list_add(&req->free_list, &ring->pending_free);
+       spin_unlock_irqrestore(&ring->pending_free_lock, flags);
        if (was_empty)
-               wake_up(&blkif->pending_free_wq);
+               wake_up(&ring->pending_free_wq);
 }
 
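alloc_req()/free_req() above form a classic locked free list in which waiters are woken only on the empty-to-non-empty transition, so the steady state pays no wake_up() cost. A self-contained sketch of the same shape, with hypothetical names:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* Return 'item' to 'pool'; wake sleepers only if the pool was empty,
 * i.e. only when someone could actually have been waiting. */
static void example_free_to_pool(struct list_head *item, struct list_head *pool,
				 spinlock_t *lock, wait_queue_head_t *wq)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(lock, flags);
	was_empty = list_empty(pool);
	list_add(item, pool);
	spin_unlock_irqrestore(lock, flags);

	if (was_empty)
		wake_up(wq);
}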
 /*
@@ -556,10 +569,10 @@ abort:
 /*
  * Notification from the guest OS.
  */
-static void blkif_notify_work(struct xen_blkif *blkif)
+static void blkif_notify_work(struct xen_blkif_ring *ring)
 {
-       blkif->waiting_reqs = 1;
-       wake_up(&blkif->wq);
+       ring->waiting_reqs = 1;
+       wake_up(&ring->wq);
 }
 
 irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
@@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
  * SCHEDULER FUNCTIONS
  */
 
-static void print_stats(struct xen_blkif *blkif)
+static void print_stats(struct xen_blkif_ring *ring)
 {
        pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
                 "  |  ds %4llu | pg: %4u/%4d\n",
-                current->comm, blkif->st_oo_req,
-                blkif->st_rd_req, blkif->st_wr_req,
-                blkif->st_f_req, blkif->st_ds_req,
-                blkif->persistent_gnt_c,
+                current->comm, ring->st_oo_req,
+                ring->st_rd_req, ring->st_wr_req,
+                ring->st_f_req, ring->st_ds_req,
+                ring->persistent_gnt_c,
                 xen_blkif_max_pgrants);
-       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
-       blkif->st_rd_req = 0;
-       blkif->st_wr_req = 0;
-       blkif->st_oo_req = 0;
-       blkif->st_ds_req = 0;
+       ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+       ring->st_rd_req = 0;
+       ring->st_wr_req = 0;
+       ring->st_oo_req = 0;
+       ring->st_ds_req = 0;
 }
 
 int xen_blkif_schedule(void *arg)
 {
-       struct xen_blkif *blkif = arg;
+       struct xen_blkif_ring *ring = arg;
+       struct xen_blkif *blkif = ring->blkif;
        struct xen_vbd *vbd = &blkif->vbd;
        unsigned long timeout;
        int ret;
 
        xen_blkif_get(blkif);
 
+       set_freezable();
        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;
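
The set_freezable() call added above opts the kthread into the system freezer (kernel threads default to PF_NOFREEZE), which is why the loop begins with try_to_freeze(). A minimal sketch of that service-thread shape:

#include <linux/freezer.h>
#include <linux/kthread.h>

/* Skeleton freezable worker: the freeze check runs before any work so
 * suspend is never held up by this thread. */
static int example_thread_fn(void *arg)
{
	set_freezable();

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;	/* re-check stop/work after thaw */

		/* ... wait for and service requests ... */
	}
	return 0;
}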
@@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg)
                timeout = msecs_to_jiffies(LRU_INTERVAL);
 
                timeout = wait_event_interruptible_timeout(
-                       blkif->wq,
-                       blkif->waiting_reqs || kthread_should_stop(),
+                       ring->wq,
+                       ring->waiting_reqs || kthread_should_stop(),
                        timeout);
                if (timeout == 0)
                        goto purge_gnt_list;
                timeout = wait_event_interruptible_timeout(
-                       blkif->pending_free_wq,
-                       !list_empty(&blkif->pending_free) ||
+                       ring->pending_free_wq,
+                       !list_empty(&ring->pending_free) ||
                        kthread_should_stop(),
                        timeout);
                if (timeout == 0)
                        goto purge_gnt_list;
 
-               blkif->waiting_reqs = 0;
+               ring->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */
 
-               ret = do_block_io_op(blkif);
+               ret = do_block_io_op(ring);
                if (ret > 0)
-                       blkif->waiting_reqs = 1;
+                       ring->waiting_reqs = 1;
                if (ret == -EACCES)
-                       wait_event_interruptible(blkif->shutdown_wq,
+                       wait_event_interruptible(ring->shutdown_wq,
                                                 kthread_should_stop());
 
 purge_gnt_list:
                if (blkif->vbd.feature_gnt_persistent &&
-                   time_after(jiffies, blkif->next_lru)) {
-                       purge_persistent_gnt(blkif);
-                       blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+                   time_after(jiffies, ring->next_lru)) {
+                       purge_persistent_gnt(ring);
+                       ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
                }
 
                /* Shrink if we have more than xen_blkif_max_buffer_pages */
-               shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
+               shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
 
-               if (log_stats && time_after(jiffies, blkif->st_print))
-                       print_stats(blkif);
+               if (log_stats && time_after(jiffies, ring->st_print))
+                       print_stats(ring);
        }
 
        /* Drain pending purge work */
-       flush_work(&blkif->persistent_purge_work);
+       flush_work(&ring->persistent_purge_work);
 
        if (log_stats)
-               print_stats(blkif);
+               print_stats(ring);
 
-       blkif->xenblkd = NULL;
+       ring->xenblkd = NULL;
        xen_blkif_put(blkif);
 
        return 0;
@@ -658,22 +673,22 @@ purge_gnt_list:
 /*
  * Remove persistent grants and empty the pool of free pages
  */
-void xen_blkbk_free_caches(struct xen_blkif *blkif)
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
 {
        /* Free all persistent grant pages */
-       if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
-               free_persistent_gnts(blkif, &blkif->persistent_gnts,
-                       blkif->persistent_gnt_c);
+       if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
+               free_persistent_gnts(ring, &ring->persistent_gnts,
+                       ring->persistent_gnt_c);
 
-       BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
-       blkif->persistent_gnt_c = 0;
+       BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+       ring->persistent_gnt_c = 0;
 
        /* Since we are shutting down remove all pages from the buffer */
-       shrink_free_pagepool(blkif, 0 /* All */);
+       shrink_free_pagepool(ring, 0 /* All */);
 }
 
 static unsigned int xen_blkbk_unmap_prepare(
-       struct xen_blkif *blkif,
+       struct xen_blkif_ring *ring,
        struct grant_page **pages,
        unsigned int num,
        struct gnttab_unmap_grant_ref *unmap_ops,
@@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare(
 
        for (i = 0; i < num; i++) {
                if (pages[i]->persistent_gnt != NULL) {
-                       put_persistent_gnt(blkif, pages[i]->persistent_gnt);
+                       put_persistent_gnt(ring, pages[i]->persistent_gnt);
                        continue;
                }
                if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare(
 
 static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
 {
-       struct pending_req* pending_req = (struct pending_req*) (data->data);
-       struct xen_blkif *blkif = pending_req->blkif;
+       struct pending_req *pending_req = (struct pending_req *)(data->data);
+       struct xen_blkif_ring *ring = pending_req->ring;
+       struct xen_blkif *blkif = ring->blkif;
 
        /* BUG_ON used to reproduce existing behaviour,
           but is this the best way to deal with this? */
        BUG_ON(result);
 
-       put_free_pages(blkif, data->pages, data->count);
-       make_response(blkif, pending_req->id,
+       put_free_pages(ring, data->pages, data->count);
+       make_response(ring, pending_req->id,
                      pending_req->operation, pending_req->status);
-       free_req(blkif, pending_req);
+       free_req(ring, pending_req);
        /*
         * Make sure the request is freed before releasing blkif,
         * or there could be a race between free_req and the
@@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
         * pending_free_wq if there's a drain going on, but it has
         * to be taken into account if the current model is changed.
         */
-       if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
+       if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
                complete(&blkif->drain_complete);
        }
        xen_blkif_put(blkif);
@@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
 static void xen_blkbk_unmap_and_respond(struct pending_req *req)
 {
        struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
-       struct xen_blkif *blkif = req->blkif;
+       struct xen_blkif_ring *ring = req->ring;
        struct grant_page **pages = req->segments;
        unsigned int invcount;
 
-       invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs,
+       invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
                                           req->unmap, req->unmap_pages);
 
        work->data = req;
@@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
  * of hypercalls, but since this is only used in error paths there's
  * no real need.
  */
-static void xen_blkbk_unmap(struct xen_blkif *blkif,
+static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
                             struct grant_page *pages[],
                             int num)
 {
@@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
 
        while (num) {
                unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-               
-               invcount = xen_blkbk_unmap_prepare(blkif, pages, batch,
+
+               invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
                                                   unmap, unmap_pages);
                if (invcount) {
                        ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
                        BUG_ON(ret);
-                       put_free_pages(blkif, unmap_pages, invcount);
+                       put_free_pages(ring, unmap_pages, invcount);
                }
                pages += batch;
                num -= batch;
        }
 }
 
-static int xen_blkbk_map(struct xen_blkif *blkif,
+static int xen_blkbk_map(struct xen_blkif_ring *ring,
                         struct grant_page *pages[],
                         int num, bool ro)
 {
@@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
        int ret = 0;
        int last_map = 0, map_until = 0;
        int use_persistent_gnts;
+       struct xen_blkif *blkif = ring->blkif;
 
        use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
 
@@ -806,10 +823,11 @@ again:
        for (i = map_until; i < num; i++) {
                uint32_t flags;
 
-               if (use_persistent_gnts)
+               if (use_persistent_gnts) {
                        persistent_gnt = get_persistent_gnt(
-                               blkif,
+                               ring,
                                pages[i]->gref);
+               }
 
                if (persistent_gnt) {
                        /*
@@ -819,7 +837,7 @@ again:
                        pages[i]->page = persistent_gnt->page;
                        pages[i]->persistent_gnt = persistent_gnt;
                } else {
-                       if (get_free_page(blkif, &pages[i]->page))
+                       if (get_free_page(ring, &pages[i]->page))
                                goto out_of_memory;
                        addr = vaddr(pages[i]->page);
                        pages_to_gnt[segs_to_map] = pages[i]->page;
@@ -852,7 +870,7 @@ again:
                        BUG_ON(new_map_idx >= segs_to_map);
                        if (unlikely(map[new_map_idx].status != 0)) {
                                pr_debug("invalid buffer -- could not remap it\n");
-                               put_free_pages(blkif, &pages[seg_idx]->page, 1);
+                               put_free_pages(ring, &pages[seg_idx]->page, 1);
                                pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
                                ret |= 1;
                                goto next;
@@ -862,7 +880,7 @@ again:
                        continue;
                }
                if (use_persistent_gnts &&
-                   blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+                   ring->persistent_gnt_c < xen_blkif_max_pgrants) {
                        /*
                         * We are using persistent grants, the grant is
                         * not mapped but we might have room for it.
@@ -880,7 +898,7 @@ again:
                        persistent_gnt->gnt = map[new_map_idx].ref;
                        persistent_gnt->handle = map[new_map_idx].handle;
                        persistent_gnt->page = pages[seg_idx]->page;
-                       if (add_persistent_gnt(blkif,
+                       if (add_persistent_gnt(ring,
                                               persistent_gnt)) {
                                kfree(persistent_gnt);
                                persistent_gnt = NULL;
@@ -888,7 +906,7 @@ again:
                        }
                        pages[seg_idx]->persistent_gnt = persistent_gnt;
                        pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
-                                persistent_gnt->gnt, blkif->persistent_gnt_c,
+                                persistent_gnt->gnt, ring->persistent_gnt_c,
                                 xen_blkif_max_pgrants);
                        goto next;
                }
@@ -913,7 +931,7 @@ next:
 
 out_of_memory:
        pr_alert("%s: out of memory\n", __func__);
-       put_free_pages(blkif, pages_to_gnt, segs_to_map);
+       put_free_pages(ring, pages_to_gnt, segs_to_map);
        return -ENOMEM;
 }
 
@@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
 {
        int rc;
 
-       rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
+       rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
                           pending_req->nr_segs,
                           (pending_req->operation != BLKIF_OP_READ));
 
@@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
                                    struct phys_req *preq)
 {
        struct grant_page **pages = pending_req->indirect_pages;
-       struct xen_blkif *blkif = pending_req->blkif;
+       struct xen_blkif_ring *ring = pending_req->ring;
        int indirect_grefs, rc, n, nseg, i;
        struct blkif_request_segment *segments = NULL;
 
@@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
        for (i = 0; i < indirect_grefs; i++)
                pages[i]->gref = req->u.indirect.indirect_grefs[i];
 
-       rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+       rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
        if (rc)
                goto unmap;
 
@@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
 unmap:
        if (segments)
                kunmap_atomic(segments);
-       xen_blkbk_unmap(blkif, pages, indirect_grefs);
+       xen_blkbk_unmap(ring, pages, indirect_grefs);
        return rc;
 }
 
-static int dispatch_discard_io(struct xen_blkif *blkif,
+static int dispatch_discard_io(struct xen_blkif_ring *ring,
                                struct blkif_request *req)
 {
        int err = 0;
        int status = BLKIF_RSP_OKAY;
+       struct xen_blkif *blkif = ring->blkif;
        struct block_device *bdev = blkif->vbd.bdev;
        unsigned long secure;
        struct phys_req preq;
@@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
                        preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
                goto fail_response;
        }
-       blkif->st_ds_req++;
+       ring->st_ds_req++;
 
        secure = (blkif->vbd.discard_secure &&
                 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@@ -1018,26 +1037,28 @@ fail_response:
        } else if (err)
                status = BLKIF_RSP_ERROR;
 
-       make_response(blkif, req->u.discard.id, req->operation, status);
+       make_response(ring, req->u.discard.id, req->operation, status);
        xen_blkif_put(blkif);
        return err;
 }
 
-static int dispatch_other_io(struct xen_blkif *blkif,
+static int dispatch_other_io(struct xen_blkif_ring *ring,
                             struct blkif_request *req,
                             struct pending_req *pending_req)
 {
-       free_req(blkif, pending_req);
-       make_response(blkif, req->u.other.id, req->operation,
+       free_req(ring, pending_req);
+       make_response(ring, req->u.other.id, req->operation,
                      BLKIF_RSP_EOPNOTSUPP);
        return -EIO;
 }
 
-static void xen_blk_drain_io(struct xen_blkif *blkif)
+static void xen_blk_drain_io(struct xen_blkif_ring *ring)
 {
+       struct xen_blkif *blkif = ring->blkif;
+
        atomic_set(&blkif->drain, 1);
        do {
-               if (atomic_read(&blkif->inflight) == 0)
+               if (atomic_read(&ring->inflight) == 0)
                        break;
                wait_for_completion_interruptible_timeout(
                                &blkif->drain_complete, HZ);
@@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
        if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
            (error == -EOPNOTSUPP)) {
                pr_debug("flush diskcache op failed, not supported\n");
-               xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
+               xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
                    (error == -EOPNOTSUPP)) {
                pr_debug("write barrier op failed, not supported\n");
-               xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
+               xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if (error) {
                pr_debug("Buffer not up-to-date at end of operation,"
@@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio)
  * and transmute  it to the block API to hand it over to the proper block disk.
  */
 static int
-__do_block_io_op(struct xen_blkif *blkif)
+__do_block_io_op(struct xen_blkif_ring *ring)
 {
-       union blkif_back_rings *blk_rings = &blkif->blk_rings;
+       union blkif_back_rings *blk_rings = &ring->blk_rings;
        struct blkif_request req;
        struct pending_req *pending_req;
        RING_IDX rc, rp;
@@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif)
        if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
                rc = blk_rings->common.rsp_prod_pvt;
                pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
-                       rp, rc, rp - rc, blkif->vbd.pdevice);
+                       rp, rc, rp - rc, ring->blkif->vbd.pdevice);
                return -EACCES;
        }
        while (rc != rp) {
@@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif)
                        break;
                }
 
-               pending_req = alloc_req(blkif);
+               pending_req = alloc_req(ring);
                if (NULL == pending_req) {
-                       blkif->st_oo_req++;
+                       ring->st_oo_req++;
                        more_to_do = 1;
                        break;
                }
 
-               switch (blkif->blk_protocol) {
+               switch (ring->blkif->blk_protocol) {
                case BLKIF_PROTOCOL_NATIVE:
                        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                        break;
@@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif)
                case BLKIF_OP_WRITE_BARRIER:
                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_INDIRECT:
-                       if (dispatch_rw_block_io(blkif, &req, pending_req))
+                       if (dispatch_rw_block_io(ring, &req, pending_req))
                                goto done;
                        break;
                case BLKIF_OP_DISCARD:
-                       free_req(blkif, pending_req);
-                       if (dispatch_discard_io(blkif, &req))
+                       free_req(ring, pending_req);
+                       if (dispatch_discard_io(ring, &req))
                                goto done;
                        break;
                default:
-                       if (dispatch_other_io(blkif, &req, pending_req))
+                       if (dispatch_other_io(ring, &req, pending_req))
                                goto done;
                        break;
                }
@@ -1178,13 +1199,13 @@ done:
 }
 
 static int
-do_block_io_op(struct xen_blkif *blkif)
+do_block_io_op(struct xen_blkif_ring *ring)
 {
-       union blkif_back_rings *blk_rings = &blkif->blk_rings;
+       union blkif_back_rings *blk_rings = &ring->blk_rings;
        int more_to_do;
 
        do {
-               more_to_do = __do_block_io_op(blkif);
+               more_to_do = __do_block_io_op(ring);
                if (more_to_do)
                        break;
 
@@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif)
  * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
  * and call the 'submit_bio' to pass it to the underlying storage.
  */
-static int dispatch_rw_block_io(struct xen_blkif *blkif,
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
                                struct blkif_request *req,
                                struct pending_req *pending_req)
 {
@@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
        switch (req_operation) {
        case BLKIF_OP_READ:
-               blkif->st_rd_req++;
+               ring->st_rd_req++;
                operation = READ;
                break;
        case BLKIF_OP_WRITE:
-               blkif->st_wr_req++;
+               ring->st_wr_req++;
                operation = WRITE_ODIRECT;
                break;
        case BLKIF_OP_WRITE_BARRIER:
                drain = true;
        case BLKIF_OP_FLUSH_DISKCACHE:
-               blkif->st_f_req++;
+               ring->st_f_req++;
                operation = WRITE_FLUSH;
                break;
        default:
@@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
        preq.nr_sects      = 0;
 
-       pending_req->blkif     = blkif;
+       pending_req->ring      = ring;
        pending_req->id        = req->u.rw.id;
        pending_req->operation = req_operation;
        pending_req->status    = BLKIF_RSP_OKAY;
@@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
                        goto fail_response;
        }
 
-       if (xen_vbd_translate(&preq, blkif, operation) != 0) {
+       if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
                pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
                         operation == READ ? "read" : "write",
                         preq.sector_number,
                         preq.sector_number + preq.nr_sects,
-                        blkif->vbd.pdevice);
+                        ring->blkif->vbd.pdevice);
                goto fail_response;
        }
 
@@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
                        pr_debug("Misaligned I/O request from domain %d\n",
-                                blkif->domid);
+                                ring->blkif->domid);
                        goto fail_response;
                }
        }
@@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
         * issue the WRITE_FLUSH.
         */
        if (drain)
-               xen_blk_drain_io(pending_req->blkif);
+               xen_blk_drain_io(pending_req->ring);
 
        /*
         * If we have failed at this point, we need to undo the M2P override,
@@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
         * This corresponding xen_blkif_put is done in __end_block_io_op, or
         * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
         */
-       xen_blkif_get(blkif);
-       atomic_inc(&blkif->inflight);
+       xen_blkif_get(ring->blkif);
+       atomic_inc(&ring->inflight);
 
        for (i = 0; i < nseg; i++) {
                while ((bio == NULL) ||
@@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
        blk_finish_plug(&plug);
 
        if (operation == READ)
-               blkif->st_rd_sect += preq.nr_sects;
+               ring->st_rd_sect += preq.nr_sects;
        else if (operation & WRITE)
-               blkif->st_wr_sect += preq.nr_sects;
+               ring->st_wr_sect += preq.nr_sects;
 
        return 0;
 
  fail_flush:
-       xen_blkbk_unmap(blkif, pending_req->segments,
+       xen_blkbk_unmap(ring, pending_req->segments,
                        pending_req->nr_segs);
  fail_response:
        /* Haven't submitted any bio's yet. */
-       make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
-       free_req(blkif, pending_req);
+       make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+       free_req(ring, pending_req);
        msleep(1); /* back off a bit */
        return -EIO;
 
@@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 /*
  * Put a response on the ring on how the operation fared.
  */
-static void make_response(struct xen_blkif *blkif, u64 id,
+static void make_response(struct xen_blkif_ring *ring, u64 id,
                          unsigned short op, int st)
 {
        struct blkif_response  resp;
        unsigned long     flags;
-       union blkif_back_rings *blk_rings = &blkif->blk_rings;
+       union blkif_back_rings *blk_rings;
        int notify;
 
        resp.id        = id;
        resp.operation = op;
        resp.status    = st;
 
-       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+       spin_lock_irqsave(&ring->blk_ring_lock, flags);
+       blk_rings = &ring->blk_rings;
        /* Place on the response ring for the relevant domain. */
-       switch (blkif->blk_protocol) {
+       switch (ring->blkif->blk_protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                       &resp, sizeof(resp));
@@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
        }
        blk_rings->common.rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
-       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+       spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
        if (notify)
-               notify_remote_via_irq(blkif->irq);
+               notify_remote_via_irq(ring->irq);
 }
 
 static int __init xen_blkif_init(void)
@@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void)
                xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
        }
 
+       if (xenblk_max_queues == 0)
+               xenblk_max_queues = num_online_cpus();
+
        rc = xen_blkif_interface_init();
        if (rc)
                goto failed_init;
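The definition of xenblk_max_queues is not part of the hunks shown here; a plausible sketch of the knob that the num_online_cpus() default above completes (parameter name and permissions are assumptions):

        /* Backend module parameter; 0 means "use num_online_cpus()" as above. */
        unsigned int xenblk_max_queues;
        module_param_named(max_queues, xenblk_max_queues, uint, 0644);
        MODULE_PARM_DESC(max_queues,
                         "Maximum number of hardware queues per virtual disk");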
index c929ae22764c9dd4345195a1542df997c5a8e29b..dea61f6ab8cbdbaffedceb4c64bda239b51a63a4 100644 (file)
@@ -46,6 +46,7 @@
 #include <xen/interface/io/protocols.h>
 
 extern unsigned int xen_blkif_max_ring_order;
+extern unsigned int xenblk_max_queues;
 /*
  * This is the maximum number of segments that would be allowed in indirect
  * requests. This value will also be passed to the frontend.
@@ -269,68 +270,79 @@ struct persistent_gnt {
        struct list_head remove_node;
 };
 
-struct xen_blkif {
-       /* Unique identifier for this interface. */
-       domid_t                 domid;
-       unsigned int            handle;
+/* Per-ring information. */
+struct xen_blkif_ring {
        /* Physical parameters of the comms window. */
        unsigned int            irq;
-       /* Comms information. */
-       enum blkif_protocol     blk_protocol;
        union blkif_back_rings  blk_rings;
        void                    *blk_ring;
-       /* The VBD attached to this interface. */
-       struct xen_vbd          vbd;
-       /* Back pointer to the backend_info. */
-       struct backend_info     *be;
        /* Private fields. */
        spinlock_t              blk_ring_lock;
-       atomic_t                refcnt;
 
        wait_queue_head_t       wq;
-       /* for barrier (drain) requests */
-       struct completion       drain_complete;
-       atomic_t                drain;
        atomic_t                inflight;
-       /* One thread per one blkif. */
+       /* One thread per blkif ring. */
        struct task_struct      *xenblkd;
        unsigned int            waiting_reqs;
 
-       /* tree to store persistent grants */
+       /* List of all 'pending_req' available */
+       struct list_head        pending_free;
+       /* And its spinlock. */
+       spinlock_t              pending_free_lock;
+       wait_queue_head_t       pending_free_wq;
+
+       /* Tree to store persistent grants. */
+       spinlock_t              pers_gnts_lock;
        struct rb_root          persistent_gnts;
        unsigned int            persistent_gnt_c;
        atomic_t                persistent_gnt_in_use;
        unsigned long           next_lru;
 
-       /* used by the kworker that offload work from the persistent purge */
+       /* Statistics. */
+       unsigned long           st_print;
+       unsigned long long      st_rd_req;
+       unsigned long long      st_wr_req;
+       unsigned long long      st_oo_req;
+       unsigned long long      st_f_req;
+       unsigned long long      st_ds_req;
+       unsigned long long      st_rd_sect;
+       unsigned long long      st_wr_sect;
+
+       /* Used by the kworker that offloads work from the persistent purge. */
        struct list_head        persistent_purge_list;
        struct work_struct      persistent_purge_work;
 
-       /* buffer of free pages to map grant refs */
+       /* Buffer of free pages to map grant refs. */
        spinlock_t              free_pages_lock;
        int                     free_pages_num;
        struct list_head        free_pages;
 
-       /* List of all 'pending_req' available */
-       struct list_head        pending_free;
-       /* And its spinlock. */
-       spinlock_t              pending_free_lock;
-       wait_queue_head_t       pending_free_wq;
-
-       /* statistics */
-       unsigned long           st_print;
-       unsigned long long                      st_rd_req;
-       unsigned long long                      st_wr_req;
-       unsigned long long                      st_oo_req;
-       unsigned long long                      st_f_req;
-       unsigned long long                      st_ds_req;
-       unsigned long long                      st_rd_sect;
-       unsigned long long                      st_wr_sect;
-
        struct work_struct      free_work;
        /* Thread shutdown wait queue. */
        wait_queue_head_t       shutdown_wq;
-       unsigned int nr_ring_pages;
+       struct xen_blkif        *blkif;
+};
+
+struct xen_blkif {
+       /* Unique identifier for this interface. */
+       domid_t                 domid;
+       unsigned int            handle;
+       /* Comms information. */
+       enum blkif_protocol     blk_protocol;
+       /* The VBD attached to this interface. */
+       struct xen_vbd          vbd;
+       /* Back pointer to the backend_info. */
+       struct backend_info     *be;
+       atomic_t                refcnt;
+       /* for barrier (drain) requests */
+       struct completion       drain_complete;
+       atomic_t                drain;
+
+       struct work_struct      free_work;
+       unsigned int            nr_ring_pages;
+       /* All rings for this device. */
+       struct xen_blkif_ring   *rings;
+       unsigned int            nr_rings;
 };
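With this split, per-device code reaches per-ring state through the rings array. A minimal sketch, not taken from this patch, of aggregating one per-ring counter:

        static unsigned long long xen_blkif_rd_sectors(struct xen_blkif *blkif)
        {
                unsigned long long sum = 0;
                unsigned int i;

                /* Each ring keeps its own statistics; sum them per device. */
                for (i = 0; i < blkif->nr_rings; i++)
                        sum += blkif->rings[i].st_rd_sect;
                return sum;
        }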
 
 struct seg_buf {
@@ -352,7 +364,7 @@ struct grant_page {
  * response queued for it, with the saved 'id' passed back.
  */
 struct pending_req {
-       struct xen_blkif        *blkif;
+       struct xen_blkif_ring   *ring;
        u64                     id;
        int                     nr_segs;
        atomic_t                pendcnt;
@@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void);
 irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
 int xen_blkif_schedule(void *arg);
 int xen_blkif_purge_persistent(void *arg);
-void xen_blkbk_free_caches(struct xen_blkif *blkif);
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
 
 int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
                              struct backend_info *be, int state);
index f53cff42f8dab8891143ff2d6a3626273857eec6..876763f7f13e1a77f8f64738de967c1f21815440 100644 (file)
@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 {
        int err;
        char name[BLKBACK_NAME_LEN];
+       struct xen_blkif_ring *ring;
+       int i;
 
        /* Not ready to connect? */
-       if (!blkif->irq || !blkif->vbd.bdev)
+       if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
                return;
 
        /* Already connected? */
@@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
        }
        invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
 
-       blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name);
-       if (IS_ERR(blkif->xenblkd)) {
-               err = PTR_ERR(blkif->xenblkd);
-               blkif->xenblkd = NULL;
-               xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
-               return;
+       for (i = 0; i < blkif->nr_rings; i++) {
+               ring = &blkif->rings[i];
+               ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
+               if (IS_ERR(ring->xenblkd)) {
+                       err = PTR_ERR(ring->xenblkd);
+                       ring->xenblkd = NULL;
+                       xenbus_dev_fatal(blkif->be->dev, err,
+                                       "start %s-%d xenblkd", name, i);
+                       goto out;
+               }
+       }
+       return;
+
+out:
+       while (--i >= 0) {
+               ring = &blkif->rings[i];
+               kthread_stop(ring->xenblkd);
+       }
+       return;
+}
+
+static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
+{
+       unsigned int r;
+
+       blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL);
+       if (!blkif->rings)
+               return -ENOMEM;
+
+       for (r = 0; r < blkif->nr_rings; r++) {
+               struct xen_blkif_ring *ring = &blkif->rings[r];
+
+               spin_lock_init(&ring->blk_ring_lock);
+               init_waitqueue_head(&ring->wq);
+               INIT_LIST_HEAD(&ring->pending_free);
+               INIT_LIST_HEAD(&ring->persistent_purge_list);
+               INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
+               spin_lock_init(&ring->free_pages_lock);
+               INIT_LIST_HEAD(&ring->free_pages);
+
+               spin_lock_init(&ring->pending_free_lock);
+               init_waitqueue_head(&ring->pending_free_wq);
+               init_waitqueue_head(&ring->shutdown_wq);
+               ring->blkif = blkif;
+               ring->st_print = jiffies;
+               xen_blkif_get(blkif);
        }
+
+       return 0;
 }
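A sketch of the intended call sequence, mirroring what connect_ring() does further down (names as in this patch):

        blkif->nr_rings = requested_num_queues;  /* from multi-queue-num-queues */
        if (xen_blkif_alloc_rings(blkif))
                return -ENOMEM;  /* rings are freed in xen_blkif_disconnect() */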
 
 static struct xen_blkif *xen_blkif_alloc(domid_t domid)
@@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
                return ERR_PTR(-ENOMEM);
 
        blkif->domid = domid;
-       spin_lock_init(&blkif->blk_ring_lock);
        atomic_set(&blkif->refcnt, 1);
-       init_waitqueue_head(&blkif->wq);
        init_completion(&blkif->drain_complete);
-       atomic_set(&blkif->drain, 0);
-       blkif->st_print = jiffies;
-       blkif->persistent_gnts.rb_node = NULL;
-       spin_lock_init(&blkif->free_pages_lock);
-       INIT_LIST_HEAD(&blkif->free_pages);
-       INIT_LIST_HEAD(&blkif->persistent_purge_list);
-       blkif->free_pages_num = 0;
-       atomic_set(&blkif->persistent_gnt_in_use, 0);
-       atomic_set(&blkif->inflight, 0);
-       INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
-
-       INIT_LIST_HEAD(&blkif->pending_free);
        INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-       spin_lock_init(&blkif->pending_free_lock);
-       init_waitqueue_head(&blkif->pending_free_wq);
-       init_waitqueue_head(&blkif->shutdown_wq);
 
        return blkif;
 }
 
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
                         unsigned int nr_grefs, unsigned int evtchn)
 {
        int err;
+       struct xen_blkif *blkif = ring->blkif;
 
        /* Already connected through? */
-       if (blkif->irq)
+       if (ring->irq)
                return 0;
 
        err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
-                                    &blkif->blk_ring);
+                                    &ring->blk_ring);
        if (err < 0)
                return err;
 
@@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
        case BLKIF_PROTOCOL_NATIVE:
        {
                struct blkif_sring *sring;
-               sring = (struct blkif_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.native, sring,
+               sring = (struct blkif_sring *)ring->blk_ring;
+               BACK_RING_INIT(&ring->blk_rings.native, sring,
                               XEN_PAGE_SIZE * nr_grefs);
                break;
        }
        case BLKIF_PROTOCOL_X86_32:
        {
                struct blkif_x86_32_sring *sring_x86_32;
-               sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32,
+               sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
+               BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
                               XEN_PAGE_SIZE * nr_grefs);
                break;
        }
        case BLKIF_PROTOCOL_X86_64:
        {
                struct blkif_x86_64_sring *sring_x86_64;
-               sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64,
+               sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
+               BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
                               XEN_PAGE_SIZE * nr_grefs);
                break;
        }
@@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
 
        err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
                                                    xen_blkif_be_int, 0,
-                                                   "blkif-backend", blkif);
+                                                   "blkif-backend", ring);
        if (err < 0) {
-               xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
-               blkif->blk_rings.common.sring = NULL;
+               xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+               ring->blk_rings.common.sring = NULL;
                return err;
        }
-       blkif->irq = err;
+       ring->irq = err;
 
        return 0;
 }
@@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
 static int xen_blkif_disconnect(struct xen_blkif *blkif)
 {
        struct pending_req *req, *n;
-       int i = 0, j;
+       unsigned int j, r;
 
-       if (blkif->xenblkd) {
-               kthread_stop(blkif->xenblkd);
-               wake_up(&blkif->shutdown_wq);
-               blkif->xenblkd = NULL;
-       }
+       for (r = 0; r < blkif->nr_rings; r++) {
+               struct xen_blkif_ring *ring = &blkif->rings[r];
+               unsigned int i = 0;
 
-       /* The above kthread_stop() guarantees that at this point we
-        * don't have any discard_io or other_io requests. So, checking
-        * for inflight IO is enough.
-        */
-       if (atomic_read(&blkif->inflight) > 0)
-               return -EBUSY;
+               if (ring->xenblkd) {
+                       kthread_stop(ring->xenblkd);
+                       wake_up(&ring->shutdown_wq);
+                       ring->xenblkd = NULL;
+               }
 
-       if (blkif->irq) {
-               unbind_from_irqhandler(blkif->irq, blkif);
-               blkif->irq = 0;
-       }
+               /* The above kthread_stop() guarantees that at this point we
+                * don't have any discard_io or other_io requests. So, checking
+                * for inflight IO is enough.
+                */
+               if (atomic_read(&ring->inflight) > 0)
+                       return -EBUSY;
 
-       if (blkif->blk_rings.common.sring) {
-               xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
-               blkif->blk_rings.common.sring = NULL;
-       }
+               if (ring->irq) {
+                       unbind_from_irqhandler(ring->irq, ring);
+                       ring->irq = 0;
+               }
 
-       /* Remove all persistent grants and the cache of ballooned pages. */
-       xen_blkbk_free_caches(blkif);
+               if (ring->blk_rings.common.sring) {
+                       xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+                       ring->blk_rings.common.sring = NULL;
+               }
 
-       /* Check that there is no request in use */
-       list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
-               list_del(&req->free_list);
+               /* Remove all persistent grants and the cache of ballooned pages. */
+               xen_blkbk_free_caches(ring);
 
-               for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
-                       kfree(req->segments[j]);
+               /* Check that there is no request in use */
+               list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
+                       list_del(&req->free_list);
 
-               for (j = 0; j < MAX_INDIRECT_PAGES; j++)
-                       kfree(req->indirect_pages[j]);
+                       for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+                               kfree(req->segments[j]);
 
-               kfree(req);
-               i++;
-       }
+                       for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+                               kfree(req->indirect_pages[j]);
+
+                       kfree(req);
+                       i++;
+               }
 
-       WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+               BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
+               BUG_ON(!list_empty(&ring->persistent_purge_list));
+               BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+               BUG_ON(!list_empty(&ring->free_pages));
+               BUG_ON(ring->free_pages_num != 0);
+               BUG_ON(ring->persistent_gnt_c != 0);
+               WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+               xen_blkif_put(blkif);
+       }
        blkif->nr_ring_pages = 0;
+       /*
+        * blkif->rings was allocated in connect_ring(), so free it
+        * here.
+        */
+       kfree(blkif->rings);
+       blkif->rings = NULL;
+       blkif->nr_rings = 0;
 
        return 0;
 }
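The reference counting that makes this per-ring teardown safe, sketched from the hunks visible here:

        /*
         * xen_blkif_alloc()        refcnt = 1    dropped in xen_blkbk_remove()
         * xen_blkif_alloc_rings()  +1 per ring   dropped per ring above
         * dispatch_rw_block_io()   +1 per I/O    dropped when the I/O completes
         */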
@@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
        xen_vbd_free(&blkif->vbd);
 
        /* Make sure everything is drained before shutting down */
-       BUG_ON(blkif->persistent_gnt_c != 0);
-       BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
-       BUG_ON(blkif->free_pages_num != 0);
-       BUG_ON(!list_empty(&blkif->persistent_purge_list));
-       BUG_ON(!list_empty(&blkif->free_pages));
-       BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
-
        kmem_cache_free(xen_blkif_cachep, blkif);
 }
 
@@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void)
  *  sysfs interface for VBD I/O requests
  */
 
-#define VBD_SHOW(name, format, args...)                                        \
+#define VBD_SHOW_ALLRING(name, format)                                 \
        static ssize_t show_##name(struct device *_dev,                 \
                                   struct device_attribute *attr,       \
                                   char *buf)                           \
        {                                                               \
                struct xenbus_device *dev = to_xenbus_device(_dev);     \
                struct backend_info *be = dev_get_drvdata(&dev->dev);   \
+               struct xen_blkif *blkif = be->blkif;                    \
+               unsigned int i;                                         \
+               unsigned long long result = 0;                          \
                                                                        \
-               return sprintf(buf, format, ##args);                    \
+               if (!blkif->rings)                              \
+                       goto out;                                       \
+                                                                       \
+               for (i = 0; i < blkif->nr_rings; i++) {         \
+                       struct xen_blkif_ring *ring = &blkif->rings[i]; \
+                                                                       \
+                       result += ring->st_##name;                      \
+               }                                                       \
+                                                                       \
+out:                                                                   \
+               return sprintf(buf, format, result);                    \
        }                                                               \
        static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
 
-VBD_SHOW(oo_req,  "%llu\n", be->blkif->st_oo_req);
-VBD_SHOW(rd_req,  "%llu\n", be->blkif->st_rd_req);
-VBD_SHOW(wr_req,  "%llu\n", be->blkif->st_wr_req);
-VBD_SHOW(f_req,  "%llu\n", be->blkif->st_f_req);
-VBD_SHOW(ds_req,  "%llu\n", be->blkif->st_ds_req);
-VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect);
-VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect);
+VBD_SHOW_ALLRING(oo_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_req,  "%llu\n");
+VBD_SHOW_ALLRING(wr_req,  "%llu\n");
+VBD_SHOW_ALLRING(f_req,  "%llu\n");
+VBD_SHOW_ALLRING(ds_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_sect, "%llu\n");
+VBD_SHOW_ALLRING(wr_sect, "%llu\n");
 
 static struct attribute *xen_vbdstat_attrs[] = {
        &dev_attr_oo_req.attr,
@@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = {
        .attrs = xen_vbdstat_attrs,
 };
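For illustration, VBD_SHOW_ALLRING(rd_req, "%llu\n") expands to roughly the following:

        static ssize_t show_rd_req(struct device *_dev,
                                   struct device_attribute *attr, char *buf)
        {
                struct xenbus_device *dev = to_xenbus_device(_dev);
                struct backend_info *be = dev_get_drvdata(&dev->dev);
                struct xen_blkif *blkif = be->blkif;
                unsigned int i;
                unsigned long long result = 0;

                if (!blkif->rings)
                        goto out;
                for (i = 0; i < blkif->nr_rings; i++)
                        result += blkif->rings[i].st_rd_req;
        out:
                return sprintf(buf, "%llu\n", result);
        }
        static DEVICE_ATTR(rd_req, S_IRUGO, show_rd_req, NULL);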
 
+#define VBD_SHOW(name, format, args...)                                        \
+       static ssize_t show_##name(struct device *_dev,                 \
+                                  struct device_attribute *attr,       \
+                                  char *buf)                           \
+       {                                                               \
+               struct xenbus_device *dev = to_xenbus_device(_dev);     \
+               struct backend_info *be = dev_get_drvdata(&dev->dev);   \
+                                                                       \
+               return sprintf(buf, format, ##args);                    \
+       }                                                               \
+       static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
 VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
 VBD_SHOW(mode, "%s\n", be->mode);
 
@@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
 
        dev_set_drvdata(&dev->dev, NULL);
 
-       if (be->blkif) {
+       if (be->blkif)
                xen_blkif_disconnect(be->blkif);
-               xen_blkif_put(be->blkif);
-       }
 
+       /* Put the reference we set in xen_blkif_alloc(). */
+       xen_blkif_put(be->blkif);
        kfree(be->mode);
        kfree(be);
        return 0;
@@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
                goto fail;
        }
 
+       /* Multi-queue: advertise the number of queues we support. */
+       err = xenbus_printf(XBT_NIL, dev->nodename,
+                           "multi-queue-max-queues", "%u", xenblk_max_queues);
+       if (err)
+               pr_warn("Error writing multi-queue-max-queues\n");
+
        /* setup back pointer */
        be->blkif->be = be;
 
@@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev,
                }
 
                err = connect_ring(be);
-               if (err)
+               if (err) {
+                       /*
+                        * Clean up so that memory resources can be used by
+                        * other devices; connect_ring() has already reported
+                        * the error.
+                        */
+                       xen_blkif_disconnect(be->blkif);
                        break;
+               }
                xen_update_blkif_status(be->blkif);
                break;
 
@@ -825,50 +902,43 @@ again:
        xenbus_transaction_end(xbt, 1);
 }
 
-
-static int connect_ring(struct backend_info *be)
+/*
+ * Each ring may span multiple pages, depending on "ring-page-order".
+ */
+static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
 {
-       struct xenbus_device *dev = be->dev;
        unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
-       unsigned int evtchn, nr_grefs, ring_page_order;
-       unsigned int pers_grants;
-       char protocol[64] = "";
        struct pending_req *req, *n;
        int err, i, j;
+       struct xen_blkif *blkif = ring->blkif;
+       struct xenbus_device *dev = blkif->be->dev;
+       unsigned int ring_page_order, nr_grefs, evtchn;
 
-       pr_debug("%s %s\n", __func__, dev->otherend);
-
-       err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+       err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
                          &evtchn);
        if (err != 1) {
                err = -EINVAL;
-               xenbus_dev_fatal(dev, err, "reading %s/event-channel",
-                                dev->otherend);
+               xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
                return err;
        }
-       pr_info("event-channel %u\n", evtchn);
 
        err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
                          &ring_page_order);
        if (err != 1) {
-               err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
-                                 "%u", &ring_ref[0]);
+               err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
                if (err != 1) {
                        err = -EINVAL;
-                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
-                                        dev->otherend);
+                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
                        return err;
                }
                nr_grefs = 1;
-               pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
-                       ring_ref[0]);
        } else {
                unsigned int i;
 
                if (ring_page_order > xen_blkif_max_ring_order) {
                        err = -EINVAL;
                        xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
-                                        dev->otherend, ring_page_order,
+                                        dir, ring_page_order,
                                         xen_blkif_max_ring_order);
                        return err;
                }
@@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be)
                        char ring_ref_name[RINGREF_NAME_LEN];
 
                        snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-                       err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name,
+                       err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
                                           "%u", &ring_ref[i]);
                        if (err != 1) {
                                err = -EINVAL;
                                xenbus_dev_fatal(dev, err, "reading %s/%s",
-                                                dev->otherend, ring_ref_name);
+                                                dir, ring_ref_name);
                                return err;
                        }
-                       pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
                }
        }
-
-       be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
-       err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
-                           "%63s", protocol, NULL);
-       if (err)
-               strcpy(protocol, "unspecified, assuming default");
-       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
-               be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
-       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
-       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
-               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
-       else {
-               xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
-               return -1;
-       }
-       err = xenbus_gather(XBT_NIL, dev->otherend,
-                           "feature-persistent", "%u",
-                           &pers_grants, NULL);
-       if (err)
-               pers_grants = 0;
-
-       be->blkif->vbd.feature_gnt_persistent = pers_grants;
-       be->blkif->vbd.overflow_max_grants = 0;
-       be->blkif->nr_ring_pages = nr_grefs;
-
-       pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
-               nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
-               pers_grants ? "persistent grants" : "");
+       blkif->nr_ring_pages = nr_grefs;
 
        for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
                req = kzalloc(sizeof(*req), GFP_KERNEL);
                if (!req)
                        goto fail;
-               list_add_tail(&req->free_list, &be->blkif->pending_free);
+               list_add_tail(&req->free_list, &ring->pending_free);
                for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
                        req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
                        if (!req->segments[j])
@@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be)
        }
 
        /* Map the shared frame, irq etc. */
-       err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
+       err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
        if (err) {
                xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
                return err;
@@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be)
        return 0;
 
 fail:
-       list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) {
+       list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
                list_del(&req->free_list);
                for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
                        if (!req->segments[j])
@@ -962,6 +1003,93 @@ fail:
                kfree(req);
        }
        return -ENOMEM;
+
+}
+
+static int connect_ring(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       unsigned int pers_grants;
+       char protocol[64] = "";
+       int err, i;
+       char *xspath;
+       size_t xspathsize;
+       const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
+       unsigned int requested_num_queues = 0;
+
+       pr_debug("%s %s\n", __func__, dev->otherend);
+
+       be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+       err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+                           "%63s", protocol, NULL);
+       if (err)
+               strcpy(protocol, "unspecified, assuming default");
+       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+       else {
+               xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+               return -ENOSYS;
+       }
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                           "feature-persistent", "%u",
+                           &pers_grants, NULL);
+       if (err)
+               pers_grants = 0;
+
+       be->blkif->vbd.feature_gnt_persistent = pers_grants;
+       be->blkif->vbd.overflow_max_grants = 0;
+
+       /*
+        * Read the number of hardware queues requested by the frontend.
+        */
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
+                          "%u", &requested_num_queues);
+       if (err < 0) {
+               requested_num_queues = 1;
+       } else {
+               if (requested_num_queues > xenblk_max_queues
+                   || requested_num_queues == 0) {
+                       /* Buggy or malicious guest. */
+                       xenbus_dev_fatal(dev, err,
+                                       "guest requested %u queues, exceeding the maximum of %u.",
+                                       requested_num_queues, xenblk_max_queues);
+                       return -ENOSYS;
+               }
+       }
+       be->blkif->nr_rings = requested_num_queues;
+       if (xen_blkif_alloc_rings(be->blkif))
+               return -ENOMEM;
+
+       pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
+                be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
+                pers_grants ? "persistent grants" : "");
+
+       if (be->blkif->nr_rings == 1)
+               return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
+       else {
+               xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
+               xspath = kmalloc(xspathsize, GFP_KERNEL);
+               if (!xspath) {
+                       xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
+                       return -ENOMEM;
+               }
+
+               for (i = 0; i < be->blkif->nr_rings; i++) {
+                       memset(xspath, 0, xspathsize);
+                       snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
+                       err = read_per_ring_refs(&be->blkif->rings[i], xspath);
+                       if (err) {
+                               kfree(xspath);
+                               return err;
+                       }
+               }
+               kfree(xspath);
+       }
+       return 0;
 }
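The xenstore layout this negotiation reads, sketched for a two-queue frontend (values are illustrative):

        /*
         *   <otherend>/multi-queue-num-queues = "2"
         *   <otherend>/queue-0/ring-ref0      = "<grant ref>"
         *   <otherend>/queue-0/event-channel  = "<port>"
         *   <otherend>/queue-1/ring-ref0      = "<grant ref>"
         *   <otherend>/queue-1/event-channel  = "<port>"
         *
         * With a single queue the keys stay directly under <otherend>,
         * exactly as before this change.
         */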
 
 static const struct xenbus_device_id xen_blkbk_ids[] = {
index 2fee2eef988d2992480363ea2578ddb9edf17240..8a8dc91c39f7292be4fc0b1c172e12e97f5fed5c 100644 (file)
 
 #include <asm/xen/hypervisor.h>
 
+/*
+ * The minimal segment size supported by the block framework is PAGE_SIZE.
+ * When Linux uses a page size different from Xen's, it may not be possible
+ * to put all the data in a single segment.
+ * This can happen when the backend doesn't support indirect descriptors, in
+ * which case the maximum amount of data that a request can carry is
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB.
+ *
+ * Note that we only support one extra request, so the Linux page size
+ * must be <= (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
+ * 88KB.
+ */
+#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
+
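A worked instance of this predicate, assuming 64KB Linux pages over 4KB Xen pages:

        /*
         * XEN_PFN_PER_PAGE = 64KB / 4KB = 16, which is greater than
         * BLKIF_MAX_SEGMENTS_PER_REQUEST (11), so HAS_EXTRA_REQ is true:
         * one 64KB Linux segment needs 16 grants, i.e. two ring requests
         * (11 + 5).
         */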
 enum blkif_state {
        BLKIF_STATE_DISCONNECTED,
        BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
        struct list_head node;
 };
 
+enum blk_req_status {
+       REQ_WAITING,
+       REQ_DONE,
+       REQ_ERROR,
+       REQ_EOPNOTSUPP,
+};
+
 struct blk_shadow {
        struct blkif_request req;
        struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
        struct grant **indirect_grants;
        struct scatterlist *sg;
        unsigned int num_sg;
+       enum blk_req_status status;
+
+       #define NO_ASSOCIATED_ID ~0UL
+       /*
+        * ID of the sibling request, if we ever need two ring requests
+        * to handle a single block I/O request.
+        */
+       unsigned long associated_id;
 };
 
 struct split_bio {
@@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32;
 module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
 MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
 
+static unsigned int xen_blkif_max_queues = 4;
+module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
+MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
+
 /*
  * Maximum order of pages to be used for the shared ring between front and
  * backend, 4KB page granularity is used.
@@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
        __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
 
 /*
- * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
- * characters are enough. Define to 20 to keep consist with backend.
+ * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
+ * characters are enough. Define to 20 to keep consistent with backend.
  */
 #define RINGREF_NAME_LEN (20)
+/*
+ * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
+ */
+#define QUEUE_NAME_LEN (17)
+
+/*
+ *  Per-ring info.
+ *  Every blkfront device is associated with one or more blkfront_ring_info,
+ *  depending on how many hardware queues/rings are used.
+ */
+struct blkfront_ring_info {
+       /* Lock to protect data in every ring buffer. */
+       spinlock_t ring_lock;
+       struct blkif_front_ring ring;
+       unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
+       unsigned int evtchn, irq;
+       struct work_struct work;
+       struct gnttab_free_callback callback;
+       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
+       struct list_head indirect_pages;
+       struct list_head grants;
+       unsigned int persistent_gnts_c;
+       unsigned long shadow_free;
+       struct blkfront_info *dev_info;
+};
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
  */
 struct blkfront_info
 {
-       spinlock_t io_lock;
        struct mutex mutex;
        struct xenbus_device *xbdev;
        struct gendisk *gd;
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
-       int ring_ref[XENBUS_MAX_RING_GRANTS];
+       /* Number of pages per ring buffer. */
        unsigned int nr_ring_pages;
-       struct blkif_front_ring ring;
-       unsigned int evtchn, irq;
        struct request_queue *rq;
-       struct work_struct work;
-       struct gnttab_free_callback callback;
-       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
-       struct list_head grants;
-       struct list_head indirect_pages;
-       unsigned int persistent_gnts_c;
-       unsigned long shadow_free;
        unsigned int feature_flush;
        unsigned int feature_discard:1;
        unsigned int feature_secdiscard:1;
@@ -155,6 +203,8 @@ struct blkfront_info
        unsigned int max_indirect_segments;
        int is_ready;
        struct blk_mq_tag_set tag_set;
+       struct blkfront_ring_info *rinfo;
+       unsigned int nr_rings;
 };
 
 static unsigned int nr_minors;
@@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock);
 
 #define GREFS(_psegs)  ((_psegs) * GRANTS_PER_PSEG)
 
-static int blkfront_setup_indirect(struct blkfront_info *info);
-static int blkfront_gather_backend_features(struct blkfront_info *info);
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
+static void blkfront_gather_backend_features(struct blkfront_info *info);
 
-static int get_id_from_freelist(struct blkfront_info *info)
+static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
 {
-       unsigned long free = info->shadow_free;
-       BUG_ON(free >= BLK_RING_SIZE(info));
-       info->shadow_free = info->shadow[free].req.u.rw.id;
-       info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
+       unsigned long free = rinfo->shadow_free;
+
+       BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
+       rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
+       rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
        return free;
 }
 
-static int add_id_to_freelist(struct blkfront_info *info,
-                              unsigned long id)
+static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
+                             unsigned long id)
 {
-       if (info->shadow[id].req.u.rw.id != id)
+       if (rinfo->shadow[id].req.u.rw.id != id)
                return -EINVAL;
-       if (info->shadow[id].request == NULL)
+       if (rinfo->shadow[id].request == NULL)
                return -EINVAL;
-       info->shadow[id].req.u.rw.id  = info->shadow_free;
-       info->shadow[id].request = NULL;
-       info->shadow_free = id;
+       rinfo->shadow[id].req.u.rw.id  = rinfo->shadow_free;
+       rinfo->shadow[id].request = NULL;
+       rinfo->shadow_free = id;
        return 0;
 }
 
-static int fill_grant_buffer(struct blkfront_info *info, int num)
+static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
 {
+       struct blkfront_info *info = rinfo->dev_info;
        struct page *granted_page;
        struct grant *gnt_list_entry, *n;
        int i = 0;
 
-       while(i < num) {
+       while (i < num) {
                gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
                if (!gnt_list_entry)
                        goto out_of_memory;
@@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
                }
 
                gnt_list_entry->gref = GRANT_INVALID_REF;
-               list_add(&gnt_list_entry->node, &info->grants);
+               list_add(&gnt_list_entry->node, &rinfo->grants);
                i++;
        }
 
@@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
 
 out_of_memory:
        list_for_each_entry_safe(gnt_list_entry, n,
-                                &info->grants, node) {
+                                &rinfo->grants, node) {
                list_del(&gnt_list_entry->node);
                if (info->feature_persistent)
                        __free_page(gnt_list_entry->page);
@@ -263,17 +315,17 @@ out_of_memory:
        return -ENOMEM;
 }
 
-static struct grant *get_free_grant(struct blkfront_info *info)
+static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
 {
        struct grant *gnt_list_entry;
 
-       BUG_ON(list_empty(&info->grants));
-       gnt_list_entry = list_first_entry(&info->grants, struct grant,
+       BUG_ON(list_empty(&rinfo->grants));
+       gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
                                          node);
        list_del(&gnt_list_entry->node);
 
        if (gnt_list_entry->gref != GRANT_INVALID_REF)
-               info->persistent_gnts_c--;
+               rinfo->persistent_gnts_c--;
 
        return gnt_list_entry;
 }
@@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry,
 
 static struct grant *get_grant(grant_ref_t *gref_head,
                               unsigned long gfn,
-                              struct blkfront_info *info)
+                              struct blkfront_ring_info *rinfo)
 {
-       struct grant *gnt_list_entry = get_free_grant(info);
+       struct grant *gnt_list_entry = get_free_grant(rinfo);
+       struct blkfront_info *info = rinfo->dev_info;
 
        if (gnt_list_entry->gref != GRANT_INVALID_REF)
                return gnt_list_entry;
@@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
 }
 
 static struct grant *get_indirect_grant(grant_ref_t *gref_head,
-                                       struct blkfront_info *info)
+                                       struct blkfront_ring_info *rinfo)
 {
-       struct grant *gnt_list_entry = get_free_grant(info);
+       struct grant *gnt_list_entry = get_free_grant(rinfo);
+       struct blkfront_info *info = rinfo->dev_info;
 
        if (gnt_list_entry->gref != GRANT_INVALID_REF)
                return gnt_list_entry;
@@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
                struct page *indirect_page;
 
                /* Fetch a pre-allocated page to use for indirect grefs */
-               BUG_ON(list_empty(&info->indirect_pages));
-               indirect_page = list_first_entry(&info->indirect_pages,
+               BUG_ON(list_empty(&rinfo->indirect_pages));
+               indirect_page = list_first_entry(&rinfo->indirect_pages,
                                                 struct page, lru);
                list_del(&indirect_page->lru);
                gnt_list_entry->page = indirect_page;
@@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
 
 static void blkif_restart_queue_callback(void *arg)
 {
-       struct blkfront_info *info = (struct blkfront_info *)arg;
-       schedule_work(&info->work);
+       struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
+       schedule_work(&rinfo->work);
 }
 
 static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
@@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
        return 0;
 }
 
-static int blkif_queue_discard_req(struct request *req)
+static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
+                                           struct request *req,
+                                           struct blkif_request **ring_req)
 {
-       struct blkfront_info *info = req->rq_disk->private_data;
+       unsigned long id;
+
+       *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
+       rinfo->ring.req_prod_pvt++;
+
+       id = get_id_from_freelist(rinfo);
+       rinfo->shadow[id].request = req;
+       rinfo->shadow[id].status = REQ_WAITING;
+       rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
+
+       (*ring_req)->u.rw.id = id;
+
+       return id;
+}
+
+static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
+{
+       struct blkfront_info *info = rinfo->dev_info;
        struct blkif_request *ring_req;
        unsigned long id;
 
        /* Fill out a communications ring structure. */
-       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-       id = get_id_from_freelist(info);
-       info->shadow[id].request = req;
+       id = blkif_ring_get_request(rinfo, req, &ring_req);
 
        ring_req->operation = BLKIF_OP_DISCARD;
        ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
@@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req)
        else
                ring_req->u.discard.flag = 0;
 
-       info->ring.req_prod_pvt++;
-
        /* Keep a private copy so we can reissue requests when recovering. */
-       info->shadow[id].req = *ring_req;
+       rinfo->shadow[id].req = *ring_req;
 
        return 0;
 }
@@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req)
 struct setup_rw_req {
        unsigned int grant_idx;
        struct blkif_request_segment *segments;
-       struct blkfront_info *info;
+       struct blkfront_ring_info *rinfo;
        struct blkif_request *ring_req;
        grant_ref_t gref_head;
        unsigned int id;
@@ -495,6 +564,9 @@ struct setup_rw_req {
        bool need_copy;
        unsigned int bvec_off;
        char *bvec_data;
+
+       bool require_extra_req;
+       struct blkif_request *extra_ring_req;
 };
 
 static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
        /* Convenient aliases */
        unsigned int grant_idx = setup->grant_idx;
        struct blkif_request *ring_req = setup->ring_req;
-       struct blkfront_info *info = setup->info;
-       struct blk_shadow *shadow = &info->shadow[setup->id];
+       struct blkfront_ring_info *rinfo = setup->rinfo;
+       /*
+        * We always use the shadow of the first request to store the list
+        * of grants associated with the block I/O request. This makes
+        * completion easier to handle even when the block I/O request is
+        * split.
+        */
+       struct blk_shadow *shadow = &rinfo->shadow[setup->id];
+
+       if (unlikely(setup->require_extra_req &&
+                    grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+               /*
+                * We are using the second request; adjust grant_idx to be
+                * the index into its segment array.
+                */
+               grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+               ring_req = setup->extra_ring_req;
+       }
 
        if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
            (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
@@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
                        kunmap_atomic(setup->segments);
 
                n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
-               gnt_list_entry = get_indirect_grant(&setup->gref_head, info);
+               gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
                shadow->indirect_grants[n] = gnt_list_entry;
                setup->segments = kmap_atomic(gnt_list_entry->page);
                ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
        }
 
-       gnt_list_entry = get_grant(&setup->gref_head, gfn, info);
+       gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
        ref = gnt_list_entry->gref;
-       shadow->grants_used[grant_idx] = gnt_list_entry;
+       /*
+        * All the grants are stored in the shadow of the first
+        * request. Therefore we have to use the global index.
+        */
+       shadow->grants_used[setup->grant_idx] = gnt_list_entry;
 
        if (setup->need_copy) {
                void *shared_data;
@@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
        (setup->grant_idx)++;
 }
 
-static int blkif_queue_rw_req(struct request *req)
+static void blkif_setup_extra_req(struct blkif_request *first,
+                                 struct blkif_request *second)
 {
-       struct blkfront_info *info = req->rq_disk->private_data;
-       struct blkif_request *ring_req;
-       unsigned long id;
+       uint16_t nr_segments = first->u.rw.nr_segments;
+
+       /*
+        * The second request is only present when the first request uses
+        * all of its segments; it is always a continuation of the first one.
+        */
+       first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       second->u.rw.sector_number = first->u.rw.sector_number +
+               (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+       second->u.rw.handle = first->u.rw.handle;
+       second->operation = first->operation;
+}
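A worked example of the split performed above (the 16-grant figure assumes 64KB Linux pages):

        /*
         * first->u.rw.nr_segments    = 11
         * second->u.rw.nr_segments   = 16 - 11 = 5
         * second->u.rw.sector_number = first->u.rw.sector_number +
         *                              (11 * 4096) / 512, i.e. +88 sectors
         */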
+
+static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
+{
+       struct blkfront_info *info = rinfo->dev_info;
+       struct blkif_request *ring_req, *extra_ring_req = NULL;
+       unsigned long id, extra_id = NO_ASSOCIATED_ID;
+       bool require_extra_req = false;
        int i;
        struct setup_rw_req setup = {
                .grant_idx = 0,
                .segments = NULL,
-               .info = info,
+               .rinfo = rinfo,
                .need_copy = rq_data_dir(req) && info->feature_persistent,
        };
 
@@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req)
         * existing persistent grants, or if we have to get new grants,
         * as there are not sufficiently many free.
         */
-       bool new_persistent_gnts;
        struct scatterlist *sg;
        int num_sg, max_grefs, num_grant;
 
@@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req)
                 */
                max_grefs += INDIRECT_GREFS(max_grefs);
 
-       /* Check if we have enough grants to allocate a requests */
-       if (info->persistent_gnts_c < max_grefs) {
-               new_persistent_gnts = 1;
-               if (gnttab_alloc_grant_references(
-                   max_grefs - info->persistent_gnts_c,
-                   &setup.gref_head) < 0) {
+       /*
+        * We have to reserve 'max_grefs' grants because persistent
+        * grants are shared by all rings.
+        */
+       if (max_grefs > 0)
+               if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
                        gnttab_request_free_callback(
-                               &info->callback,
+                               &rinfo->callback,
                                blkif_restart_queue_callback,
-                               info,
+                               rinfo,
                                max_grefs);
                        return 1;
                }
-       } else
-               new_persistent_gnts = 0;
 
        /* Fill out a communications ring structure. */
-       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-       id = get_id_from_freelist(info);
-       info->shadow[id].request = req;
-
-       BUG_ON(info->max_indirect_segments == 0 &&
-              GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-       BUG_ON(info->max_indirect_segments &&
-              GREFS(req->nr_phys_segments) > info->max_indirect_segments);
+       id = blkif_ring_get_request(rinfo, req, &ring_req);
 
-       num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+       num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
        num_grant = 0;
        /* Calculate the number of grant used */
-       for_each_sg(info->shadow[id].sg, sg, num_sg, i)
+       for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
               num_grant += gnttab_count_grant(sg->offset, sg->length);
 
-       ring_req->u.rw.id = id;
-       info->shadow[id].num_sg = num_sg;
-       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+       require_extra_req = info->max_indirect_segments == 0 &&
+               num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
+       rinfo->shadow[id].num_sg = num_sg;
+       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+           likely(!require_extra_req)) {
                /*
                 * The indirect operation can only be a BLKIF_OP_READ or
                 * BLKIF_OP_WRITE
@@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req)
                        }
                }
                ring_req->u.rw.nr_segments = num_grant;
+               if (unlikely(require_extra_req)) {
+                       extra_id = blkif_ring_get_request(rinfo, req,
+                                                         &extra_ring_req);
+                       /*
+                        * Only the first request contains the scatter-gather
+                        * list.
+                        */
+                       rinfo->shadow[extra_id].num_sg = 0;
+
+                       blkif_setup_extra_req(ring_req, extra_ring_req);
+
+                       /* Link the 2 requests together */
+                       rinfo->shadow[extra_id].associated_id = id;
+                       rinfo->shadow[id].associated_id = extra_id;
+               }
        }
 
        setup.ring_req = ring_req;
        setup.id = id;
-       for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
+
+       setup.require_extra_req = require_extra_req;
+       if (unlikely(require_extra_req))
+               setup.extra_ring_req = extra_ring_req;
+
+       for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
                BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
                if (setup.need_copy) {
@@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req)
        if (setup.segments)
                kunmap_atomic(setup.segments);
 
-       info->ring.req_prod_pvt++;
-
        /* Keep a private copy so we can reissue requests when recovering. */
-       info->shadow[id].req = *ring_req;
+       rinfo->shadow[id].req = *ring_req;
+       if (unlikely(require_extra_req))
+               rinfo->shadow[extra_id].req = *extra_ring_req;
 
-       if (new_persistent_gnts)
+       if (max_grefs > 0)
                gnttab_free_grant_references(setup.gref_head);
 
        return 0;
@@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req)
  *
  * @req: a request struct
  */
-static int blkif_queue_request(struct request *req)
+static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
 {
-       struct blkfront_info *info = req->rq_disk->private_data;
-
-       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+       if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
                return 1;
 
        if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
-               return blkif_queue_discard_req(req);
+               return blkif_queue_discard_req(req, rinfo);
        else
-               return blkif_queue_rw_req(req);
+               return blkif_queue_rw_req(req, rinfo);
 }
 
-static inline void flush_requests(struct blkfront_info *info)
+static inline void flush_requests(struct blkfront_ring_info *rinfo)
 {
        int notify;
 
-       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
 
        if (notify)
-               notify_remote_via_irq(info->irq);
+               notify_remote_via_irq(rinfo->irq);
 }
 
 static inline bool blkif_request_flush_invalid(struct request *req,
@@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req,
 }
 
 static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
-                          const struct blk_mq_queue_data *qd)
+                         const struct blk_mq_queue_data *qd)
 {
-       struct blkfront_info *info = qd->rq->rq_disk->private_data;
+       unsigned long flags;
+       struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
 
        blk_mq_start_request(qd->rq);
-       spin_lock_irq(&info->io_lock);
-       if (RING_FULL(&info->ring))
+       spin_lock_irqsave(&rinfo->ring_lock, flags);
+       if (RING_FULL(&rinfo->ring))
                goto out_busy;
 
-       if (blkif_request_flush_invalid(qd->rq, info))
+       if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
                goto out_err;
 
-       if (blkif_queue_request(qd->rq))
+       if (blkif_queue_request(qd->rq, rinfo))
                goto out_busy;
 
-       flush_requests(info);
-       spin_unlock_irq(&info->io_lock);
+       flush_requests(rinfo);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
        return BLK_MQ_RQ_QUEUE_OK;
 
 out_err:
-       spin_unlock_irq(&info->io_lock);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
        return BLK_MQ_RQ_QUEUE_ERROR;
 
 out_busy:
-       spin_unlock_irq(&info->io_lock);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
        blk_mq_stop_hw_queue(hctx);
        return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
+static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+                           unsigned int index)
+{
+       struct blkfront_info *info = (struct blkfront_info *)data;
+
+       BUG_ON(info->nr_rings <= index);
+       hctx->driver_data = &info->rinfo[index];
+       return 0;
+}
+
 static struct blk_mq_ops blkfront_mq_ops = {
        .queue_rq = blkif_queue_rq,
        .map_queue = blk_mq_map_queue,
+       .init_hctx = blk_mq_init_hctx,
 };
 
 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
@@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 
        memset(&info->tag_set, 0, sizeof(info->tag_set));
        info->tag_set.ops = &blkfront_mq_ops;
-       info->tag_set.nr_hw_queues = 1;
-       info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+       info->tag_set.nr_hw_queues = info->nr_rings;
+       if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+               /*
+                * When indirect descriptors are not supported, an I/O
+                * request may be split across multiple requests in the
+                * ring. Halve the queue depth so that a split request
+                * can never overflow the ring.
+                */
+               info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
+       } else
+               info->tag_set.queue_depth = BLK_RING_SIZE(info);
        info->tag_set.numa_node = NUMA_NO_NODE;
        info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
        info->tag_set.cmd_size = 0;
        info->tag_set.driver_data = info;
 
        if (blk_mq_alloc_tag_set(&info->tag_set))
-               return -1;
+               return -EINVAL;
        rq = blk_mq_init_queue(&info->tag_set);
        if (IS_ERR(rq)) {
                blk_mq_free_tag_set(&info->tag_set);
-               return -1;
+               return PTR_ERR(rq);
        }
 
        queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
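
For concreteness, the halved queue depth works out as follows (the constants are assumptions for a single-page ring; BLK_RING_SIZE(info) scales with nr_ring_pages):

/*
 * Illustration only: with one ring page the ring holds 32 request
 * slots. When HAS_EXTRA_REQ applies and indirect descriptors are
 * absent, one block-layer request may consume two ring slots, so
 * capping the tag depth at 32 / 2 = 16 keeps the block layer from
 * handing the driver more requests than the ring can ever hold.
 */
unsigned int ring_slots = 32;		/* BLK_RING_SIZE(info), assumed */
unsigned int worst_slots_per_req = 2;	/* the request plus its extra req */
unsigned int depth = ring_slots / worst_slots_per_req;	/* 16 */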
@@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 
 static void xlvbd_release_gendisk(struct blkfront_info *info)
 {
-       unsigned int minor, nr_minors;
+       unsigned int minor, nr_minors, i;
 
        if (info->rq == NULL)
                return;
@@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
        /* No more blkif_request(). */
        blk_mq_stop_hw_queues(info->rq);
 
-       /* No more gnttab callback work. */
-       gnttab_cancel_free_callback(&info->callback);
+       for (i = 0; i < info->nr_rings; i++) {
+               struct blkfront_ring_info *rinfo = &info->rinfo[i];
 
-       /* Flush gnttab callback work. Must be done with no locks held. */
-       flush_work(&info->work);
+               /* No more gnttab callback work. */
+               gnttab_cancel_free_callback(&rinfo->callback);
+
+               /* Flush gnttab callback work. Must be done with no locks held. */
+               flush_work(&rinfo->work);
+       }
 
        del_gendisk(info->gd);
 
@@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
        info->gd = NULL;
 }
 
-/* Must be called with io_lock holded */
-static void kick_pending_request_queues(struct blkfront_info *info)
+/* Caller must hold rinfo->ring_lock. */
+static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
 {
-       if (!RING_FULL(&info->ring))
-               blk_mq_start_stopped_hw_queues(info->rq, true);
+       if (!RING_FULL(&rinfo->ring))
+               blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
 }
 
-static void blkif_restart_queue(struct work_struct *work)
+static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
 {
-       struct blkfront_info *info = container_of(work, struct blkfront_info, work);
+       unsigned long flags;
 
-       spin_lock_irq(&info->io_lock);
-       if (info->connected == BLKIF_STATE_CONNECTED)
-               kick_pending_request_queues(info);
-       spin_unlock_irq(&info->io_lock);
+       spin_lock_irqsave(&rinfo->ring_lock, flags);
+       kick_pending_request_queues_locked(rinfo);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 }
 
-static void blkif_free(struct blkfront_info *info, int suspend)
+static void blkif_restart_queue(struct work_struct *work)
 {
-       struct grant *persistent_gnt;
-       struct grant *n;
-       int i, j, segs;
+       struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
 
-       /* Prevent new requests being issued until we fix things up. */
-       spin_lock_irq(&info->io_lock);
-       info->connected = suspend ?
-               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
-       /* No more blkif_request(). */
-       if (info->rq)
-               blk_mq_stop_hw_queues(info->rq);
+       if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
+               kick_pending_request_queues(rinfo);
+}
 
-       /* Remove all persistent grants */
-       if (!list_empty(&info->grants)) {
-               list_for_each_entry_safe(persistent_gnt, n,
-                                        &info->grants, node) {
-                       list_del(&persistent_gnt->node);
-                       if (persistent_gnt->gref != GRANT_INVALID_REF) {
-                               gnttab_end_foreign_access(persistent_gnt->gref,
-                                                         0, 0UL);
-                               info->persistent_gnts_c--;
-                       }
-                       if (info->feature_persistent)
-                               __free_page(persistent_gnt->page);
-                       kfree(persistent_gnt);
-               }
-       }
-       BUG_ON(info->persistent_gnts_c != 0);
+static void blkif_free_ring(struct blkfront_ring_info *rinfo)
+{
+       struct grant *persistent_gnt, *n;
+       struct blkfront_info *info = rinfo->dev_info;
+       int i, j, segs;
 
        /*
         * Remove indirect pages, this only happens when using indirect
         * descriptors but not persistent grants
         */
-       if (!list_empty(&info->indirect_pages)) {
+       if (!list_empty(&rinfo->indirect_pages)) {
                struct page *indirect_page, *n;
 
                BUG_ON(info->feature_persistent);
-               list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+               list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
                        list_del(&indirect_page->lru);
                        __free_page(indirect_page);
                }
        }
 
+       /* Remove all persistent grants. */
+       if (!list_empty(&rinfo->grants)) {
+               list_for_each_entry_safe(persistent_gnt, n,
+                                        &rinfo->grants, node) {
+                       list_del(&persistent_gnt->node);
+                       if (persistent_gnt->gref != GRANT_INVALID_REF) {
+                               gnttab_end_foreign_access(persistent_gnt->gref,
+                                                         0, 0UL);
+                               rinfo->persistent_gnts_c--;
+                       }
+                       if (info->feature_persistent)
+                               __free_page(persistent_gnt->page);
+                       kfree(persistent_gnt);
+               }
+       }
+       BUG_ON(rinfo->persistent_gnts_c != 0);
+
        for (i = 0; i < BLK_RING_SIZE(info); i++) {
                /*
                 * Clear persistent grants present in requests already
                 * on the shared ring
                 */
-               if (!info->shadow[i].request)
+               if (!rinfo->shadow[i].request)
                        goto free_shadow;
 
-               segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
-                      info->shadow[i].req.u.indirect.nr_segments :
-                      info->shadow[i].req.u.rw.nr_segments;
+               segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+                      rinfo->shadow[i].req.u.indirect.nr_segments :
+                      rinfo->shadow[i].req.u.rw.nr_segments;
                for (j = 0; j < segs; j++) {
-                       persistent_gnt = info->shadow[i].grants_used[j];
+                       persistent_gnt = rinfo->shadow[i].grants_used[j];
                        gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
                        if (info->feature_persistent)
                                __free_page(persistent_gnt->page);
                        kfree(persistent_gnt);
                }
 
-               if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+               if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
                        /*
                         * If this is not an indirect operation don't try to
                         * free indirect segments
@@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend)
                        goto free_shadow;
 
                for (j = 0; j < INDIRECT_GREFS(segs); j++) {
-                       persistent_gnt = info->shadow[i].indirect_grants[j];
+                       persistent_gnt = rinfo->shadow[i].indirect_grants[j];
                        gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
                        __free_page(persistent_gnt->page);
                        kfree(persistent_gnt);
                }
 
 free_shadow:
-               kfree(info->shadow[i].grants_used);
-               info->shadow[i].grants_used = NULL;
-               kfree(info->shadow[i].indirect_grants);
-               info->shadow[i].indirect_grants = NULL;
-               kfree(info->shadow[i].sg);
-               info->shadow[i].sg = NULL;
+               kfree(rinfo->shadow[i].grants_used);
+               rinfo->shadow[i].grants_used = NULL;
+               kfree(rinfo->shadow[i].indirect_grants);
+               rinfo->shadow[i].indirect_grants = NULL;
+               kfree(rinfo->shadow[i].sg);
+               rinfo->shadow[i].sg = NULL;
        }
 
        /* No more gnttab callback work. */
-       gnttab_cancel_free_callback(&info->callback);
-       spin_unlock_irq(&info->io_lock);
+       gnttab_cancel_free_callback(&rinfo->callback);
 
        /* Flush gnttab callback work. Must be done with no locks held. */
-       flush_work(&info->work);
+       flush_work(&rinfo->work);
 
        /* Free resources associated with old device channel. */
        for (i = 0; i < info->nr_ring_pages; i++) {
-               if (info->ring_ref[i] != GRANT_INVALID_REF) {
-                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
-                       info->ring_ref[i] = GRANT_INVALID_REF;
+               if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
+                       gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
+                       rinfo->ring_ref[i] = GRANT_INVALID_REF;
                }
        }
-       free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
-       info->ring.sring = NULL;
+       free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+       rinfo->ring.sring = NULL;
 
-       if (info->irq)
-               unbind_from_irqhandler(info->irq, info);
-       info->evtchn = info->irq = 0;
+       if (rinfo->irq)
+               unbind_from_irqhandler(rinfo->irq, rinfo);
+       rinfo->evtchn = rinfo->irq = 0;
+}
 
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+       unsigned int i;
+
+       /* Prevent new requests being issued until we fix things up. */
+       info->connected = suspend ?
+               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+       /* No more blkif_request(). */
+       if (info->rq)
+               blk_mq_stop_hw_queues(info->rq);
+
+       for (i = 0; i < info->nr_rings; i++)
+               blkif_free_ring(&info->rinfo[i]);
+
+       kfree(info->rinfo);
+       info->rinfo = NULL;
+       info->nr_rings = 0;
 }
 
 struct copy_from_grant {
@@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
        kunmap_atomic(shared_data);
 }
 
-static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+       switch (rsp) {
+       case BLKIF_RSP_OKAY:
+               return REQ_DONE;
+       case BLKIF_RSP_EOPNOTSUPP:
+               return REQ_EOPNOTSUPP;
+       case BLKIF_RSP_ERROR:
+               /* Fallthrough. */
+       default:
+               return REQ_ERROR;
+       }
+}
+
+/*
+ * Get the final status of a block request based on its two ring responses.
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+                                 enum blk_req_status s2)
+{
+       BUG_ON(s1 == REQ_WAITING);
+       BUG_ON(s2 == REQ_WAITING);
+
+       if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+               return BLKIF_RSP_ERROR;
+       else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+               return BLKIF_RSP_EOPNOTSUPP;
+       return BLKIF_RSP_OKAY;
+}
+
+static bool blkif_completion(unsigned long *id,
+                            struct blkfront_ring_info *rinfo,
                             struct blkif_response *bret)
 {
        int i = 0;
        struct scatterlist *sg;
        int num_sg, num_grant;
+       struct blkfront_info *info = rinfo->dev_info;
+       struct blk_shadow *s = &rinfo->shadow[*id];
        struct copy_from_grant data = {
-               .s = s,
                .grant_idx = 0,
        };
 
        num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
                s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+       /* The I/O request may be split in two. */
+       if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+               struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+               /* Keep the status of the current response in shadow. */
+               s->status = blkif_rsp_to_req_status(bret->status);
+
+               /* Wait for the second response if it has not yet arrived. */
+               if (s2->status == REQ_WAITING)
+                       return 0;
+
+               bret->status = blkif_get_final_status(s->status,
+                                                     s2->status);
+
+               /*
+                * All the grants are stored in the first shadow in order
+                * to make the completion code simpler.
+                */
+               num_grant += s2->req.u.rw.nr_segments;
+
+               /*
+                * The two responses may not come in order. Only the
+                * first request will store the scatter-gather list.
+                */
+               if (s2->num_sg != 0) {
+                       /* Update "id" with the ID of the first response. */
+                       *id = s->associated_id;
+                       s = s2;
+               }
+
+               /*
+                * The second request is no longer needed, so recycle
+                * it now.
+                */
+               if (add_id_to_freelist(rinfo, s->associated_id))
+                       WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+                            info->gd->disk_name, s->associated_id);
+       }
+
+       data.s = s;
        num_sg = s->num_sg;
 
        if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                        if (!info->feature_persistent)
                                pr_alert_ratelimited("backed has not unmapped grant: %u\n",
                                                     s->grants_used[i]->gref);
-                       list_add(&s->grants_used[i]->node, &info->grants);
-                       info->persistent_gnts_c++;
+                       list_add(&s->grants_used[i]->node, &rinfo->grants);
+                       rinfo->persistent_gnts_c++;
                } else {
                        /*
                         * If the grant is not mapped by the backend we end the
@@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                         */
                        gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
                        s->grants_used[i]->gref = GRANT_INVALID_REF;
-                       list_add_tail(&s->grants_used[i]->node, &info->grants);
+                       list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
                }
        }
        if (s->req.operation == BLKIF_OP_INDIRECT) {
@@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                                if (!info->feature_persistent)
                                        pr_alert_ratelimited("backed has not unmapped grant: %u\n",
                                                             s->indirect_grants[i]->gref);
-                               list_add(&s->indirect_grants[i]->node, &info->grants);
-                               info->persistent_gnts_c++;
+                               list_add(&s->indirect_grants[i]->node, &rinfo->grants);
+                               rinfo->persistent_gnts_c++;
                        } else {
                                struct page *indirect_page;
 
@@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                                 */
                                if (!info->feature_persistent) {
                                        indirect_page = s->indirect_grants[i]->page;
-                                       list_add(&indirect_page->lru, &info->indirect_pages);
+                                       list_add(&indirect_page->lru, &rinfo->indirect_pages);
                                }
                                s->indirect_grants[i]->gref = GRANT_INVALID_REF;
-                               list_add_tail(&s->indirect_grants[i]->node, &info->grants);
+                               list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
                        }
                }
        }
+
+       return 1;
 }
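
A condensed sketch of the two-response completion dance above, reusing the enum and helpers introduced in this patch (everything else is simplified):

/*
 * Sketch: the first response to arrive only records its status; the
 * request completes when the partner's status is no longer
 * REQ_WAITING, at which point the two statuses are merged.
 */
static bool half_completed(enum blk_req_status *mine,
			   enum blk_req_status other,
			   int rsp_status, int *final)
{
	*mine = blkif_rsp_to_req_status(rsp_status);
	if (other == REQ_WAITING)
		return false;	/* park; wait for the second response */
	*final = blkif_get_final_status(*mine, other);
	return true;		/* both halves in; complete the request */
}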
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
        struct blkif_response *bret;
        RING_IDX i, rp;
        unsigned long flags;
-       struct blkfront_info *info = (struct blkfront_info *)dev_id;
+       struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
+       struct blkfront_info *info = rinfo->dev_info;
        int error;
 
-       spin_lock_irqsave(&info->io_lock, flags);
-
-       if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
-               spin_unlock_irqrestore(&info->io_lock, flags);
+       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
                return IRQ_HANDLED;
-       }
 
+       spin_lock_irqsave(&rinfo->ring_lock, flags);
  again:
-       rp = info->ring.sring->rsp_prod;
+       rp = rinfo->ring.sring->rsp_prod;
        rmb(); /* Ensure we see queued responses up to 'rp'. */
 
-       for (i = info->ring.rsp_cons; i != rp; i++) {
+       for (i = rinfo->ring.rsp_cons; i != rp; i++) {
                unsigned long id;
 
-               bret = RING_GET_RESPONSE(&info->ring, i);
+               bret = RING_GET_RESPONSE(&rinfo->ring, i);
                id   = bret->id;
                /*
                 * The backend has messed up and given us an id that we would
@@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                         * the id is busted. */
                        continue;
                }
-               req  = info->shadow[id].request;
+               req  = rinfo->shadow[id].request;
 
-               if (bret->operation != BLKIF_OP_DISCARD)
-                       blkif_completion(&info->shadow[id], info, bret);
+               if (bret->operation != BLKIF_OP_DISCARD) {
+                       /*
+                        * We may need to wait for an extra response if the
+                        * I/O request is split in two.
+                        */
+                       if (!blkif_completion(&id, rinfo, bret))
+                               continue;
+               }
 
-               if (add_id_to_freelist(info, id)) {
+               if (add_id_to_freelist(rinfo, id)) {
                        WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
                             info->gd->disk_name, op_name(bret->operation), id);
                        continue;
@@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                                error = -EOPNOTSUPP;
                        }
                        if (unlikely(bret->status == BLKIF_RSP_ERROR &&
-                                    info->shadow[id].req.u.rw.nr_segments == 0)) {
+                                    rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
                                printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
                                error = -EOPNOTSUPP;
@@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                }
        }
 
-       info->ring.rsp_cons = i;
+       rinfo->ring.rsp_cons = i;
 
-       if (i != info->ring.req_prod_pvt) {
+       if (i != rinfo->ring.req_prod_pvt) {
                int more_to_do;
-               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+               RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
                if (more_to_do)
                        goto again;
        } else
-               info->ring.sring->rsp_event = i + 1;
+               rinfo->ring.sring->rsp_event = i + 1;
 
-       kick_pending_request_queues(info);
+       kick_pending_request_queues_locked(rinfo);
 
-       spin_unlock_irqrestore(&info->io_lock, flags);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
 
        return IRQ_HANDLED;
 }
 
 
 static int setup_blkring(struct xenbus_device *dev,
-                        struct blkfront_info *info)
+                        struct blkfront_ring_info *rinfo)
 {
        struct blkif_sring *sring;
        int err, i;
+       struct blkfront_info *info = rinfo->dev_info;
        unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
        grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
 
        for (i = 0; i < info->nr_ring_pages; i++)
-               info->ring_ref[i] = GRANT_INVALID_REF;
+               rinfo->ring_ref[i] = GRANT_INVALID_REF;
 
        sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
                                                       get_order(ring_size));
@@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev,
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
-       FRONT_RING_INIT(&info->ring, sring, ring_size);
+       FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
 
-       err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
+       err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
        if (err < 0) {
                free_pages((unsigned long)sring, get_order(ring_size));
-               info->ring.sring = NULL;
+               rinfo->ring.sring = NULL;
                goto fail;
        }
        for (i = 0; i < info->nr_ring_pages; i++)
-               info->ring_ref[i] = gref[i];
+               rinfo->ring_ref[i] = gref[i];
 
-       err = xenbus_alloc_evtchn(dev, &info->evtchn);
+       err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
        if (err)
                goto fail;
 
-       err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
-                                       "blkif", info);
+       err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0,
+                                       "blkif", rinfo);
        if (err <= 0) {
                xenbus_dev_fatal(dev, err,
                                 "bind_evtchn_to_irqhandler failed");
                goto fail;
        }
-       info->irq = err;
+       rinfo->irq = err;
 
        return 0;
 fail:
@@ -1455,6 +1701,53 @@ fail:
        return err;
 }
 
+/*
+ * Write out the per-ring/queue xenstore nodes, including ring-ref and
+ * event-channel; each ring buffer may span multiple pages depending on
+ * ->nr_ring_pages.
+ */
+static int write_per_ring_nodes(struct xenbus_transaction xbt,
+                               struct blkfront_ring_info *rinfo, const char *dir)
+{
+       int err;
+       unsigned int i;
+       const char *message = NULL;
+       struct blkfront_info *info = rinfo->dev_info;
+
+       if (info->nr_ring_pages == 1) {
+               err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
+               if (err) {
+                       message = "writing ring-ref";
+                       goto abort_transaction;
+               }
+       } else {
+               for (i = 0; i < info->nr_ring_pages; i++) {
+                       char ring_ref_name[RINGREF_NAME_LEN];
+
+                       snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+                       err = xenbus_printf(xbt, dir, ring_ref_name,
+                                           "%u", rinfo->ring_ref[i]);
+                       if (err) {
+                               message = "writing ring-ref";
+                               goto abort_transaction;
+                       }
+               }
+       }
+
+       err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
+       if (err) {
+               message = "writing event-channel";
+               goto abort_transaction;
+       }
+
+       return 0;
+
+abort_transaction:
+       xenbus_transaction_end(xbt, 1);
+       if (message)
+               xenbus_dev_fatal(info->xbdev, err, "%s", message);
+
+       return err;
+}
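
The xenstore layout these helpers build up, shown for an assumed two-queue device with single-page rings (the paths, grant references, and event-channel numbers are illustrative):

    .../device/vbd/<dev>/multi-queue-num-queues = "2"
    .../device/vbd/<dev>/queue-0/ring-ref       = "8"
    .../device/vbd/<dev>/queue-0/event-channel  = "21"
    .../device/vbd/<dev>/queue-1/ring-ref       = "9"
    .../device/vbd/<dev>/queue-1/event-channel  = "22"

With a single queue the ring-ref and event-channel nodes stay directly under the device node, and with nr_ring_pages > 1 each ring writes ring-ref0..ring-refN plus one device-level ring-page-order node.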
 
 /* Common code used when first setting up, and when resuming. */
 static int talk_to_blkback(struct xenbus_device *dev,
@@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
        const char *message = NULL;
        struct xenbus_transaction xbt;
-       int err, i;
-       unsigned int max_page_order = 0;
+       int err;
+       unsigned int i, max_page_order = 0;
        unsigned int ring_page_order = 0;
 
        err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
@@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
                info->nr_ring_pages = 1 << ring_page_order;
        }
 
-       /* Create shared ring, alloc event channel. */
-       err = setup_blkring(dev, info);
-       if (err)
-               goto out;
+       for (i = 0; i < info->nr_rings; i++) {
+               struct blkfront_ring_info *rinfo = &info->rinfo[i];
+
+               /* Create shared ring, alloc event channel. */
+               err = setup_blkring(dev, rinfo);
+               if (err)
+                       goto destroy_blkring;
+       }
 
 again:
        err = xenbus_transaction_start(&xbt);
@@ -1487,38 +1784,49 @@ again:
                goto destroy_blkring;
        }
 
-       if (info->nr_ring_pages == 1) {
-               err = xenbus_printf(xbt, dev->nodename,
-                                   "ring-ref", "%u", info->ring_ref[0]);
+       if (info->nr_ring_pages > 1) {
+               err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
+                                   ring_page_order);
                if (err) {
-                       message = "writing ring-ref";
+                       message = "writing ring-page-order";
                        goto abort_transaction;
                }
+       }
+
+       /* We already got the number of queues/rings in _probe */
+       if (info->nr_rings == 1) {
+               err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename);
+               if (err)
+                       goto destroy_blkring;
        } else {
-               err = xenbus_printf(xbt, dev->nodename,
-                                   "ring-page-order", "%u", ring_page_order);
+               char *path;
+               size_t pathsize;
+
+               err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
+                                   info->nr_rings);
                if (err) {
-                       message = "writing ring-page-order";
+                       message = "writing multi-queue-num-queues";
                        goto abort_transaction;
                }
 
-               for (i = 0; i < info->nr_ring_pages; i++) {
-                       char ring_ref_name[RINGREF_NAME_LEN];
+               pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
+               path = kmalloc(pathsize, GFP_KERNEL);
+               if (!path) {
+                       err = -ENOMEM;
+                       message = "ENOMEM while writing ring references";
+                       goto abort_transaction;
+               }
 
-                       snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-                       err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
-                                           "%u", info->ring_ref[i]);
+               for (i = 0; i < info->nr_rings; i++) {
+                       memset(path, 0, pathsize);
+                       snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
+                       err = write_per_ring_nodes(xbt, &info->rinfo[i], path);
                        if (err) {
-                               message = "writing ring-ref";
-                               goto abort_transaction;
+                               kfree(path);
+                               goto destroy_blkring;
                        }
                }
-       }
-       err = xenbus_printf(xbt, dev->nodename,
-                           "event-channel", "%u", info->evtchn);
-       if (err) {
-               message = "writing event-channel";
-               goto abort_transaction;
+               kfree(path);
        }
        err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
                            XEN_IO_PROTO_ABI_NATIVE);
@@ -1540,9 +1848,14 @@ again:
                goto destroy_blkring;
        }
 
-       for (i = 0; i < BLK_RING_SIZE(info); i++)
-               info->shadow[i].req.u.rw.id = i+1;
-       info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+       for (i = 0; i < info->nr_rings; i++) {
+               unsigned int j;
+               struct blkfront_ring_info *rinfo = &info->rinfo[i];
+
+               for (j = 0; j < BLK_RING_SIZE(info); j++)
+                       rinfo->shadow[j].req.u.rw.id = j + 1;
+               rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+       }
        xenbus_switch_state(dev, XenbusStateInitialised);
 
        return 0;
@@ -1553,7 +1866,10 @@ again:
                xenbus_dev_fatal(dev, err, "%s", message);
  destroy_blkring:
        blkif_free(info, 0);
- out:
+
+       kfree(info);
+       dev_set_drvdata(&dev->dev, NULL);
+
        return err;
 }
 
@@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
 {
        int err, vdevice;
+       unsigned int r_index;
        struct blkfront_info *info;
+       unsigned int backend_max_queues = 0;
 
        /* FIXME: Use dynamic device id if this is not set. */
        err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev,
                return -ENOMEM;
        }
 
-       mutex_init(&info->mutex);
-       spin_lock_init(&info->io_lock);
        info->xbdev = dev;
+       /* Check if backend supports multiple queues. */
+       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                          "multi-queue-max-queues", "%u", &backend_max_queues);
+       if (err < 0)
+               backend_max_queues = 1;
+
+       info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
+       /* We need at least one ring. */
+       if (!info->nr_rings)
+               info->nr_rings = 1;
+
+       info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
+       if (!info->rinfo) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
+               kfree(info);
+               return -ENOMEM;
+       }
+
+       for (r_index = 0; r_index < info->nr_rings; r_index++) {
+               struct blkfront_ring_info *rinfo;
+
+               rinfo = &info->rinfo[r_index];
+               INIT_LIST_HEAD(&rinfo->indirect_pages);
+               INIT_LIST_HEAD(&rinfo->grants);
+               rinfo->dev_info = info;
+               INIT_WORK(&rinfo->work, blkif_restart_queue);
+               spin_lock_init(&rinfo->ring_lock);
+       }
+
+       mutex_init(&info->mutex);
        info->vdevice = vdevice;
-       INIT_LIST_HEAD(&info->grants);
-       INIT_LIST_HEAD(&info->indirect_pages);
-       info->persistent_gnts_c = 0;
        info->connected = BLKIF_STATE_DISCONNECTED;
-       INIT_WORK(&info->work, blkif_restart_queue);
 
        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio)
 
 static int blkif_recover(struct blkfront_info *info)
 {
-       int i;
+       unsigned int i, r_index;
        struct request *req, *n;
        struct blk_shadow *copy;
        int rc;
@@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info)
        struct split_bio *split_bio;
        struct list_head requests;
 
-       /* Stage 1: Make a safe copy of the shadow state. */
-       copy = kmemdup(info->shadow, sizeof(info->shadow),
-                      GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
-       if (!copy)
-               return -ENOMEM;
-
-       /* Stage 2: Set up free list. */
-       memset(&info->shadow, 0, sizeof(info->shadow));
-       for (i = 0; i < BLK_RING_SIZE(info); i++)
-               info->shadow[i].req.u.rw.id = i+1;
-       info->shadow_free = info->ring.req_prod_pvt;
-       info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
-
-       rc = blkfront_gather_backend_features(info);
-       if (rc) {
-               kfree(copy);
-               return rc;
-       }
-
+       blkfront_gather_backend_features(info);
        segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
        blk_queue_max_segments(info->rq, segs);
        bio_list_init(&bio_list);
        INIT_LIST_HEAD(&requests);
-       for (i = 0; i < BLK_RING_SIZE(info); i++) {
-               /* Not in use? */
-               if (!copy[i].request)
-                       continue;
 
-               /*
-                * Get the bios in the request so we can re-queue them.
-                */
-               if (copy[i].request->cmd_flags &
-                   (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+       for (r_index = 0; r_index < info->nr_rings; r_index++) {
+               struct blkfront_ring_info *rinfo;
+
+               rinfo = &info->rinfo[r_index];
+               /* Stage 1: Make a safe copy of the shadow state. */
+               copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
+                              GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
+               if (!copy)
+                       return -ENOMEM;
+
+               /* Stage 2: Set up free list. */
+               memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
+               for (i = 0; i < BLK_RING_SIZE(info); i++)
+                       rinfo->shadow[i].req.u.rw.id = i+1;
+               rinfo->shadow_free = rinfo->ring.req_prod_pvt;
+               rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+
+               rc = blkfront_setup_indirect(rinfo);
+               if (rc) {
+                       kfree(copy);
+                       return rc;
+               }
+
+               for (i = 0; i < BLK_RING_SIZE(info); i++) {
+                       /* Not in use? */
+                       if (!copy[i].request)
+                               continue;
+
                        /*
-                        * Flush operations don't contain bios, so
-                        * we need to requeue the whole request
+                        * Get the bios in the request so we can re-queue them.
                         */
-                       list_add(&copy[i].request->queuelist, &requests);
-                       continue;
+                       if (copy[i].request->cmd_flags &
+                           (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+                               /*
+                                * Flush operations don't contain bios, so
+                                * we need to requeue the whole request
+                                */
+                               list_add(&copy[i].request->queuelist, &requests);
+                               continue;
+                       }
+                       merge_bio.head = copy[i].request->bio;
+                       merge_bio.tail = copy[i].request->biotail;
+                       bio_list_merge(&bio_list, &merge_bio);
+                       copy[i].request->bio = NULL;
+                       blk_end_request_all(copy[i].request, 0);
                }
-               merge_bio.head = copy[i].request->bio;
-               merge_bio.tail = copy[i].request->biotail;
-               bio_list_merge(&bio_list, &merge_bio);
-               copy[i].request->bio = NULL;
-               blk_end_request_all(copy[i].request, 0);
-       }
-
-       kfree(copy);
 
+               kfree(copy);
+       }
        xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
-       spin_lock_irq(&info->io_lock);
-
        /* Now safe for us to use the shared ring */
        info->connected = BLKIF_STATE_CONNECTED;
 
-       /* Kick any other new requests queued since we resumed */
-       kick_pending_request_queues(info);
+       for (r_index = 0; r_index < info->nr_rings; r_index++) {
+               struct blkfront_ring_info *rinfo;
+
+               rinfo = &info->rinfo[r_index];
+               /* Kick any other new requests queued since we resumed */
+               kick_pending_request_queues(rinfo);
+       }
 
        list_for_each_entry_safe(req, n, &requests, queuelist) {
                /* Requeue pending requests (flush or discard) */
@@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info)
                BUG_ON(req->nr_phys_segments > segs);
                blk_mq_requeue_request(req);
        }
-       spin_unlock_irq(&info->io_lock);
        blk_mq_kick_requeue_list(info->rq);
 
        while ((bio = bio_list_pop(&bio_list)) != NULL) {
@@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev)
        return err;
 }
 
-static void
-blkfront_closing(struct blkfront_info *info)
+static void blkfront_closing(struct blkfront_info *info)
 {
        struct xenbus_device *xbdev = info->xbdev;
        struct block_device *bdev = NULL;
@@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info)
                info->feature_secdiscard = !!discard_secure;
 }
 
-static int blkfront_setup_indirect(struct blkfront_info *info)
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 {
        unsigned int psegs, grants;
        int err, i;
+       struct blkfront_info *info = rinfo->dev_info;
 
-       if (info->max_indirect_segments == 0)
-               grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       if (info->max_indirect_segments == 0) {
+               if (!HAS_EXTRA_REQ)
+                       grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+               else {
+                       /*
+                        * When an extra req is required, the maximum
+                        * number of grants supported is bounded by the
+                        * size of a Linux block segment.
+                        */
+                       grants = GRANTS_PER_PSEG;
+               }
+       }
        else
                grants = info->max_indirect_segments;
        psegs = grants / GRANTS_PER_PSEG;
 
-       err = fill_grant_buffer(info,
+       err = fill_grant_buffer(rinfo,
                                (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
        if (err)
                goto out_of_memory;
@@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
                 */
                int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
 
-               BUG_ON(!list_empty(&info->indirect_pages));
+               BUG_ON(!list_empty(&rinfo->indirect_pages));
                for (i = 0; i < num; i++) {
                        struct page *indirect_page = alloc_page(GFP_NOIO);
                        if (!indirect_page)
                                goto out_of_memory;
-                       list_add(&indirect_page->lru, &info->indirect_pages);
+                       list_add(&indirect_page->lru, &rinfo->indirect_pages);
                }
        }
 
        for (i = 0; i < BLK_RING_SIZE(info); i++) {
-               info->shadow[i].grants_used = kzalloc(
-                       sizeof(info->shadow[i].grants_used[0]) * grants,
+               rinfo->shadow[i].grants_used = kzalloc(
+                       sizeof(rinfo->shadow[i].grants_used[0]) * grants,
                        GFP_NOIO);
-               info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO);
+               rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO);
                if (info->max_indirect_segments)
-                       info->shadow[i].indirect_grants = kzalloc(
-                               sizeof(info->shadow[i].indirect_grants[0]) *
+                       rinfo->shadow[i].indirect_grants = kzalloc(
+                               sizeof(rinfo->shadow[i].indirect_grants[0]) *
                                INDIRECT_GREFS(grants),
                                GFP_NOIO);
-               if ((info->shadow[i].grants_used == NULL) ||
-                       (info->shadow[i].sg == NULL) ||
+               if ((rinfo->shadow[i].grants_used == NULL) ||
+                       (rinfo->shadow[i].sg == NULL) ||
                     (info->max_indirect_segments &&
-                    (info->shadow[i].indirect_grants == NULL)))
+                    (rinfo->shadow[i].indirect_grants == NULL)))
                        goto out_of_memory;
-               sg_init_table(info->shadow[i].sg, psegs);
+               sg_init_table(rinfo->shadow[i].sg, psegs);
        }
 
 
@@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 
 out_of_memory:
        for (i = 0; i < BLK_RING_SIZE(info); i++) {
-               kfree(info->shadow[i].grants_used);
-               info->shadow[i].grants_used = NULL;
-               kfree(info->shadow[i].sg);
-               info->shadow[i].sg = NULL;
-               kfree(info->shadow[i].indirect_grants);
-               info->shadow[i].indirect_grants = NULL;
-       }
-       if (!list_empty(&info->indirect_pages)) {
+               kfree(rinfo->shadow[i].grants_used);
+               rinfo->shadow[i].grants_used = NULL;
+               kfree(rinfo->shadow[i].sg);
+               rinfo->shadow[i].sg = NULL;
+               kfree(rinfo->shadow[i].indirect_grants);
+               rinfo->shadow[i].indirect_grants = NULL;
+       }
+       if (!list_empty(&rinfo->indirect_pages)) {
                struct page *indirect_page, *n;
-               list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+               list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
                        list_del(&indirect_page->lru);
                        __free_page(indirect_page);
                }
@@ -1927,7 +2287,7 @@ out_of_memory:
 /*
  * Gather all backend feature-*
  */
-static int blkfront_gather_backend_features(struct blkfront_info *info)
+static void blkfront_gather_backend_features(struct blkfront_info *info)
 {
        int err;
        int barrier, flush, discard, persistent;
@@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
        else
                info->max_indirect_segments = min(indirect_segments,
                                                  xen_blkif_max_segments);
-
-       return blkfront_setup_indirect(info);
 }
 
 /*
@@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info)
        unsigned long sector_size;
        unsigned int physical_sector_size;
        unsigned int binfo;
-       int err;
+       int err, i;
 
        switch (info->connected) {
        case BLKIF_STATE_CONNECTED:
@@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info)
        if (err != 1)
                physical_sector_size = sector_size;
 
-       err = blkfront_gather_backend_features(info);
-       if (err) {
-               xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
-                                info->xbdev->otherend);
-               return;
+       blkfront_gather_backend_features(info);
+       for (i = 0; i < info->nr_rings; i++) {
+               err = blkfront_setup_indirect(&info->rinfo[i]);
+               if (err) {
+                       xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+                                        info->xbdev->otherend);
+                       blkif_free(info, 0);
+                       break;
+               }
        }
 
        err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info)
        xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
        /* Kick pending requests. */
-       spin_lock_irq(&info->io_lock);
        info->connected = BLKIF_STATE_CONNECTED;
-       kick_pending_request_queues(info);
-       spin_unlock_irq(&info->io_lock);
+       for (i = 0; i < info->nr_rings; i++)
+               kick_pending_request_queues(&info->rinfo[i]);
 
        add_disk(info->gd);
 
@@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
        case XenbusStateInitWait:
                if (dev->state != XenbusStateInitialising)
                        break;
-               if (talk_to_blkback(dev, info)) {
-                       kfree(info);
-                       dev_set_drvdata(&dev->dev, NULL);
+               if (talk_to_blkback(dev, info))
                        break;
-               }
        case XenbusStateInitialising:
        case XenbusStateInitialised:
        case XenbusStateReconfiguring:
@@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
                break;
 
        case XenbusStateConnected:
+               if (dev->state != XenbusStateInitialised) {
+                       if (talk_to_blkback(dev, info))
+                               break;
+               }
                blkfront_connect(info);
                break;
 
@@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = {
 static int __init xlblk_init(void)
 {
        int ret;
+       int nr_cpus = num_online_cpus();
 
        if (!xen_domain())
                return -ENODEV;
@@ -2288,7 +2651,13 @@ static int __init xlblk_init(void)
        if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
                pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
                        xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
-               xen_blkif_max_ring_order = 0;
+               xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
+       }
+
+       if (xen_blkif_max_queues > nr_cpus) {
+               pr_info("Invalid max_queues (%d), will use default max: %d.\n",
+                       xen_blkif_max_queues, nr_cpus);
+               xen_blkif_max_queues = nr_cpus;
        }
 
        if (!xen_has_pv_disk_devices())
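
Assuming the rest of this series exposes xen_blkif_max_queues as a module parameter named max_queues (its definition is not in this hunk), the per-device ring count could be chosen at load time along these lines, with oversized values clamped to num_online_cpus() as shown above:

    # illustrative invocation
    modprobe xen-blkfront max_queues=4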
index 6b1721f978c2945f4af716636c8373fe8807dca0..4f6f94c43412c0773e176cf8d67acd73fb6ca274 100644 (file)
@@ -689,7 +689,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig)
 {
        loff_t ret;
 
-       mutex_lock(&file_inode(file)->i_mutex);
+       inode_lock(file_inode(file));
        switch (orig) {
        case SEEK_CUR:
                offset += file->f_pos;
@@ -706,7 +706,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig)
        default:
                ret = -EINVAL;
        }
-       mutex_unlock(&file_inode(file)->i_mutex);
+       inode_unlock(file_inode(file));
        return ret;
 }
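
This hunk and the ps3flash one below are mechanical conversions to the then-new inode_lock() helpers, which at this point in the series are thin wrappers over i_mutex, roughly:

/* Sketch of the wrappers this conversion targets (include/linux/fs.h). */
static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}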
 
index f1d7fa45c2759b0ed97d9a54669d705f2547040c..f3f92d5fcda05b2a3692d9f100c7caa48445e5e1 100644 (file)
@@ -93,14 +93,11 @@ struct vma_data {
        spinlock_t lock;        /* Serialize access to this structure. */
        int count;              /* Number of pages allocated. */
        enum mspec_page_type type; /* Type of pages allocated. */
-       int flags;              /* See VMD_xxx below. */
        unsigned long vm_start; /* Original (unsplit) base. */
        unsigned long vm_end;   /* Original (unsplit) end. */
        unsigned long maddr[0]; /* Array of MSPEC addresses. */
 };
 
-#define VMD_VMALLOCED 0x1      /* vmalloc'd rather than kmalloc'd */
-
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES  4
@@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma)
                               "failed to zero page %ld\n", my_page);
        }
 
-       if (vdata->flags & VMD_VMALLOCED)
-               vfree(vdata);
-       else
-               kfree(vdata);
+       kvfree(vdata);
 }
 
 /*
@@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
                                        enum mspec_page_type type)
 {
        struct vma_data *vdata;
-       int pages, vdata_size, flags = 0;
+       int pages, vdata_size;
 
        if (vma->vm_pgoff != 0)
                return -EINVAL;
@@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
        vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
        if (vdata_size <= PAGE_SIZE)
                vdata = kzalloc(vdata_size, GFP_KERNEL);
-       else {
+       else
                vdata = vzalloc(vdata_size);
-               flags = VMD_VMALLOCED;
-       }
        if (!vdata)
                return -ENOMEM;
 
        vdata->vm_start = vma->vm_start;
        vdata->vm_end = vma->vm_end;
-       vdata->flags = flags;
        vdata->type = type;
        spin_lock_init(&vdata->lock);
        atomic_set(&vdata->refcnt, 1);
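
This hunk and the drm_ht_remove() one below both lean on kvfree(), which inspects the pointer itself, so callers no longer need a flag recording how the buffer was allocated; its implementation is essentially:

void kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}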
index 0b311fa277ef097f485d9aa92da18c262a7f10dd..b526dc15c27113d70d67bf23464c4a2e1c8c4418 100644 (file)
@@ -290,9 +290,9 @@ static int ps3flash_fsync(struct file *file, loff_t start, loff_t end, int datas
 {
        struct inode *inode = file_inode(file);
        int err;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        err = ps3flash_writeback(ps3flash_dev);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err;
 }
 
index 3dd69df9c970fa0e86417d6f4e0a147fe8f65885..07d494276aad04195c9e7247089649b302d3082c 100644 (file)
@@ -381,6 +381,7 @@ config CRYPTO_DEV_BFIN_CRC
 
 config CRYPTO_DEV_ATMEL_AES
        tristate "Support for Atmel AES hw accelerator"
+       depends on HAS_DMA
        depends on AT_XDMAC || AT_HDMAC || COMPILE_TEST
        select CRYPTO_AES
        select CRYPTO_AEAD
index 5621612ee92169da0e7f934a6c45a4ee29690f97..6dd3317ca3653947c00ee4790ca18417f3431d79 100644 (file)
@@ -280,6 +280,7 @@ static const char *atmel_aes_reg_name(u32 offset, char *tmp, size_t sz)
        case AES_GCMHR(2):
        case AES_GCMHR(3):
                snprintf(tmp, sz, "GCMHR[%u]", (offset - AES_GCMHR(0)) >> 2);
+               break;
 
        default:
                snprintf(tmp, sz, "0x%02x", offset);
index 0ac0ba86761110db09b9ec68c6c213d6529784eb..1e480f140663530a699d8bf51b402bce394a935e 100644 (file)
@@ -389,7 +389,7 @@ static int qat_hal_check_ae_alive(struct icp_qat_fw_loader_handle *handle)
 {
        unsigned int base_cnt, cur_cnt;
        unsigned char ae;
-       unsigned int times = MAX_RETRY_TIMES;
+       int times = MAX_RETRY_TIMES;
 
        for (ae = 0; ae < handle->hal_handle->ae_max_num; ae++) {
                qat_hal_rd_ae_csr(handle, ae, PROFILE_COUNT,
@@ -402,7 +402,7 @@ static int qat_hal_check_ae_alive(struct icp_qat_fw_loader_handle *handle)
                        cur_cnt &= 0xffff;
                } while (times-- && (cur_cnt == base_cnt));
 
-               if (!times) {
+               if (times < 0) {
                        pr_err("QAT: AE%d is inactive!!\n", ae);
                        return -EFAULT;
                }
@@ -453,7 +453,11 @@ static int qat_hal_init_esram(struct icp_qat_fw_loader_handle *handle)
        void __iomem *csr_addr =
                        (void __iomem *)((uintptr_t)handle->hal_ep_csr_addr_v +
                        ESRAM_AUTO_INIT_CSR_OFFSET);
-       unsigned int csr_val, times = 30;
+       unsigned int csr_val;
+       int times = 30;
+
+       if (handle->pci_dev->device == ADF_C3XXX_PCI_DEVICE_ID)
+               return 0;
 
        csr_val = ADF_CSR_RD(csr_addr, 0);
        if ((csr_val & ESRAM_AUTO_TINIT) && (csr_val & ESRAM_AUTO_TINIT_DONE))
@@ -467,7 +471,7 @@ static int qat_hal_init_esram(struct icp_qat_fw_loader_handle *handle)
                qat_hal_wait_cycles(handle, 0, ESRAM_AUTO_INIT_USED_CYCLES, 0);
                csr_val = ADF_CSR_RD(csr_addr, 0);
        } while (!(csr_val & ESRAM_AUTO_TINIT_DONE) && times--);
-       if ((!times)) {
+       if (times < 0) {
                pr_err("QAT: Fail to init eSram!\n");
                return -EFAULT;
        }
@@ -658,7 +662,7 @@ static int qat_hal_clear_gpr(struct icp_qat_fw_loader_handle *handle)
                        ret = qat_hal_wait_cycles(handle, ae, 20, 1);
                } while (ret && times--);
 
-               if (!times) {
+               if (times < 0) {
                        pr_err("QAT: clear GPR of AE %d failed", ae);
                        return -EINVAL;
                }
@@ -693,14 +697,12 @@ int qat_hal_init(struct adf_accel_dev *accel_dev)
        struct adf_hw_device_data *hw_data = accel_dev->hw_device;
        struct adf_bar *misc_bar =
                        &pci_info->pci_bars[hw_data->get_misc_bar_id(hw_data)];
-       struct adf_bar *sram_bar =
-                       &pci_info->pci_bars[hw_data->get_sram_bar_id(hw_data)];
+       struct adf_bar *sram_bar;
 
        handle = kzalloc(sizeof(*handle), GFP_KERNEL);
        if (!handle)
                return -ENOMEM;
 
-       handle->hal_sram_addr_v = sram_bar->virt_addr;
        handle->hal_cap_g_ctl_csr_addr_v =
                (void __iomem *)((uintptr_t)misc_bar->virt_addr +
                                 ICP_QAT_CAP_OFFSET);
@@ -714,6 +716,11 @@ int qat_hal_init(struct adf_accel_dev *accel_dev)
                (void __iomem *)((uintptr_t)handle->hal_cap_ae_xfer_csr_addr_v +
                                 LOCAL_TO_XFER_REG_OFFSET);
        handle->pci_dev = pci_info->pci_dev;
+       if (handle->pci_dev->device != ADF_C3XXX_PCI_DEVICE_ID) {
+               sram_bar =
+                       &pci_info->pci_bars[hw_data->get_sram_bar_id(hw_data)];
+               handle->hal_sram_addr_v = sram_bar->virt_addr;
+       }
        handle->fw_auth = (handle->pci_dev->device ==
                           ADF_DH895XCC_PCI_DEVICE_ID) ? false : true;
        handle->hal_handle = kzalloc(sizeof(*handle->hal_handle), GFP_KERNEL);
index c3b80fd65d6254e89caf3381529f673764286115..7b30b307674ba6d1b71065a6923758ea251d05b2 100644 (file)
@@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item);
 void drm_ht_remove(struct drm_open_hash *ht)
 {
        if (ht->table) {
-               if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order)
-                       kfree(ht->table);
-               else
-                       vfree(ht->table);
+               kvfree(ht->table);
                ht->table = NULL;
        }
 }
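
kvfree() picks the right release path (kfree() or vfree()) from the pointer itself, so callers no longer need to re-derive which allocator was used, as the removed drm_ht_remove() branch tried to. A hedged in-kernel sketch of the pairing it expects (table_alloc/table_free are illustrative names; kvmalloc() exists only on newer kernels than this diff targets):

    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    static void *table_alloc(size_t size)
    {
            /* small tables from the slab, large ones from vmalloc */
            if (size <= PAGE_SIZE)
                    return kmalloc(size, GFP_KERNEL);
            return vmalloc(size);
    }

    static void table_free(void *table)
    {
            kvfree(table);  /* correct for either allocation above */
    }
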
index aa26f3c3416bbbcd04dac1ec5381fcb081105d5e..8a8440c0eed1bf64519c5b7bce7903d010246c1c 100644 (file)
@@ -5,6 +5,7 @@ menuconfig INFINIBAND
        depends on NET
        depends on INET
        depends on m || IPV6 != m
+       select IRQ_POLL
        ---help---
          Core support for InfiniBand (IB).  Make sure to also select
          any protocols you wish to use as well as drivers for your
@@ -54,6 +55,15 @@ config INFINIBAND_ADDR_TRANS
        depends on INFINIBAND
        default y
 
+config INFINIBAND_ADDR_TRANS_CONFIGFS
+       bool
+       depends on INFINIBAND_ADDR_TRANS && CONFIGFS_FS && !(INFINIBAND=y && CONFIGFS_FS=m)
+       default y
+       ---help---
+         ConfigFS support for RDMA communication manager (CM).
+         This allows the user to configure the default GID type that the
+         CM uses for each device when initiating new connections.
+
 source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
index d43a8994ac5c129d201b0f164442c618192eea9d..f818538a7f4e118b309052c3d43a1aa18d5dbe65 100644 (file)
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) +=    ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=        ib_uverbs.o ib_ucm.o \
                                        $(user_access-y)
 
-ib_core-y :=                   packer.o ud_header.o verbs.o sysfs.o \
+ib_core-y :=                   packer.o ud_header.o verbs.o cq.o sysfs.o \
                                device.o fmr_pool.o cache.o netlink.o \
                                roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
@@ -24,6 +24,8 @@ iw_cm-y :=                    iwcm.o iwpm_util.o iwpm_msg.o
 
 rdma_cm-y :=                   cma.o
 
+rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
+
 rdma_ucm-y :=                  ucma.o
 
 ib_addr-y :=                   addr.o
index 34b1adad07aacf92e9bbfa9621e084f2ec756f74..337353d86cfadec33064e10117539769fcc0f95d 100644 (file)
@@ -121,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
 }
 EXPORT_SYMBOL(rdma_copy_addr);
 
-int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
+int rdma_translate_ip(const struct sockaddr *addr,
+                     struct rdma_dev_addr *dev_addr,
                      u16 *vlan_id)
 {
        struct net_device *dev;
@@ -139,7 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
        switch (addr->sa_family) {
        case AF_INET:
                dev = ip_dev_find(dev_addr->net,
-                       ((struct sockaddr_in *) addr)->sin_addr.s_addr);
+                       ((const struct sockaddr_in *)addr)->sin_addr.s_addr);
 
                if (!dev)
                        return ret;
@@ -154,7 +155,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
                rcu_read_lock();
                for_each_netdev_rcu(dev_addr->net, dev) {
                        if (ipv6_chk_addr(dev_addr->net,
-                                         &((struct sockaddr_in6 *) addr)->sin6_addr,
+                                         &((const struct sockaddr_in6 *)addr)->sin6_addr,
                                          dev, 1)) {
                                ret = rdma_copy_addr(dev_addr, dev, NULL);
                                if (vlan_id)
@@ -198,7 +199,8 @@ static void queue_req(struct addr_req *req)
        mutex_unlock(&lock);
 }
 
-static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr)
+static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+                       const void *daddr)
 {
        struct neighbour *n;
        int ret;
@@ -222,8 +224,9 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, v
 }
 
 static int addr4_resolve(struct sockaddr_in *src_in,
-                        struct sockaddr_in *dst_in,
-                        struct rdma_dev_addr *addr)
+                        const struct sockaddr_in *dst_in,
+                        struct rdma_dev_addr *addr,
+                        struct rtable **prt)
 {
        __be32 src_ip = src_in->sin_addr.s_addr;
        __be32 dst_ip = dst_in->sin_addr.s_addr;
@@ -243,33 +246,29 @@ static int addr4_resolve(struct sockaddr_in *src_in,
        src_in->sin_family = AF_INET;
        src_in->sin_addr.s_addr = fl4.saddr;
 
-       if (rt->dst.dev->flags & IFF_LOOPBACK) {
-               ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
-               if (!ret)
-                       memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
-               goto put;
-       }
+       /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
+        * routable), so we set the network type accordingly.
+        */
+       if (rt->rt_uses_gateway)
+               addr->network = RDMA_NETWORK_IPV4;
 
-       /* If the device does ARP internally, return 'done' */
-       if (rt->dst.dev->flags & IFF_NOARP) {
-               ret = rdma_copy_addr(addr, rt->dst.dev, NULL);
-               goto put;
-       }
+       addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
 
-       ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
-put:
-       ip_rt_put(rt);
+       *prt = rt;
+       return 0;
 out:
        return ret;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static int addr6_resolve(struct sockaddr_in6 *src_in,
-                        struct sockaddr_in6 *dst_in,
-                        struct rdma_dev_addr *addr)
+                        const struct sockaddr_in6 *dst_in,
+                        struct rdma_dev_addr *addr,
+                        struct dst_entry **pdst)
 {
        struct flowi6 fl6;
        struct dst_entry *dst;
+       struct rt6_info *rt;
        int ret;
 
        memset(&fl6, 0, sizeof fl6);
@@ -281,6 +280,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
        if ((ret = dst->error))
                goto put;
 
+       rt = (struct rt6_info *)dst;
        if (ipv6_addr_any(&fl6.saddr)) {
                ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev,
                                         &fl6.daddr, 0, &fl6.saddr);
@@ -291,43 +291,111 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
                src_in->sin6_addr = fl6.saddr;
        }
 
-       if (dst->dev->flags & IFF_LOOPBACK) {
-               ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
-               if (!ret)
-                       memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
-               goto put;
-       }
+       /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
+        * routable), so we set the network type accordingly.
+        */
+       if (rt->rt6i_flags & RTF_GATEWAY)
+               addr->network = RDMA_NETWORK_IPV6;
 
-       /* If the device does ARP internally, return 'done' */
-       if (dst->dev->flags & IFF_NOARP) {
-               ret = rdma_copy_addr(addr, dst->dev, NULL);
-               goto put;
-       }
+       addr->hoplimit = ip6_dst_hoplimit(dst);
 
-       ret = dst_fetch_ha(dst, addr, &fl6.daddr);
+       *pdst = dst;
+       return 0;
 put:
        dst_release(dst);
        return ret;
 }
 #else
 static int addr6_resolve(struct sockaddr_in6 *src_in,
-                        struct sockaddr_in6 *dst_in,
-                        struct rdma_dev_addr *addr)
+                        const struct sockaddr_in6 *dst_in,
+                        struct rdma_dev_addr *addr,
+                        struct dst_entry **pdst)
 {
        return -EADDRNOTAVAIL;
 }
 #endif
 
+static int addr_resolve_neigh(struct dst_entry *dst,
+                             const struct sockaddr *dst_in,
+                             struct rdma_dev_addr *addr)
+{
+       if (dst->dev->flags & IFF_LOOPBACK) {
+               int ret;
+
+               ret = rdma_translate_ip(dst_in, addr, NULL);
+               if (!ret)
+                       memcpy(addr->dst_dev_addr, addr->src_dev_addr,
+                              MAX_ADDR_LEN);
+
+               return ret;
+       }
+
+       /* If the device doesn't do ARP internally, resolve the neighbour */
+       if (!(dst->dev->flags & IFF_NOARP)) {
+               const struct sockaddr_in *dst_in4 =
+                       (const struct sockaddr_in *)dst_in;
+               const struct sockaddr_in6 *dst_in6 =
+                       (const struct sockaddr_in6 *)dst_in;
+
+               return dst_fetch_ha(dst, addr,
+                                   dst_in->sa_family == AF_INET ?
+                                   (const void *)&dst_in4->sin_addr.s_addr :
+                                   (const void *)&dst_in6->sin6_addr);
+       }
+
+       return rdma_copy_addr(addr, dst->dev, NULL);
+}
+
 static int addr_resolve(struct sockaddr *src_in,
-                       struct sockaddr *dst_in,
-                       struct rdma_dev_addr *addr)
+                       const struct sockaddr *dst_in,
+                       struct rdma_dev_addr *addr,
+                       bool resolve_neigh)
 {
+       struct net_device *ndev;
+       struct dst_entry *dst;
+       int ret;
+
        if (src_in->sa_family == AF_INET) {
-               return addr4_resolve((struct sockaddr_in *) src_in,
-                       (struct sockaddr_in *) dst_in, addr);
-       } else
-               return addr6_resolve((struct sockaddr_in6 *) src_in,
-                       (struct sockaddr_in6 *) dst_in, addr);
+               struct rtable *rt = NULL;
+               const struct sockaddr_in *dst_in4 =
+                       (const struct sockaddr_in *)dst_in;
+
+               ret = addr4_resolve((struct sockaddr_in *)src_in,
+                                   dst_in4, addr, &rt);
+               if (ret)
+                       return ret;
+
+               if (resolve_neigh)
+                       ret = addr_resolve_neigh(&rt->dst, dst_in, addr);
+
+               ndev = rt->dst.dev;
+               dev_hold(ndev);
+
+               ip_rt_put(rt);
+       } else {
+               const struct sockaddr_in6 *dst_in6 =
+                       (const struct sockaddr_in6 *)dst_in;
+
+               ret = addr6_resolve((struct sockaddr_in6 *)src_in,
+                                   dst_in6, addr,
+                                   &dst);
+               if (ret)
+                       return ret;
+
+               if (resolve_neigh)
+                       ret = addr_resolve_neigh(dst, dst_in, addr);
+
+               ndev = dst->dev;
+               dev_hold(ndev);
+
+               dst_release(dst);
+       }
+
+       addr->bound_dev_if = ndev->ifindex;
+       addr->net = dev_net(ndev);
+       dev_put(ndev);
+
+       return ret;
 }
 
 static void process_req(struct work_struct *work)
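
Note the ordering at the end of the rewritten addr_resolve(): the netdevice is pinned with dev_hold() before the route reference that was transitively keeping it alive is dropped, and only then are ifindex and netns copied out. A standalone toy model of that hold-then-release discipline (refcounts are plain ints here, not kernel refcounting):

    #include <stdio.h>

    struct netdev { int refs; int ifindex; };
    struct route  { int refs; struct netdev *dev; };

    static void ndev_hold(struct netdev *n) { n->refs++; }
    static void ndev_put(struct netdev *n)  { n->refs--; }
    static void route_put(struct route *r)  { r->refs--; }

    static int resolve_ifindex(struct route *rt)
    {
            struct netdev *ndev = rt->dev;
            int ifindex;

            ndev_hold(ndev);        /* pin before the route goes away */
            route_put(rt);          /* rt and rt->dev are off-limits now */

            ifindex = ndev->ifindex;
            ndev_put(ndev);
            return ifindex;
    }

    int main(void)
    {
            struct netdev nd = { .refs = 1, .ifindex = 2 };
            struct route  rt = { .refs = 1, .dev = &nd };

            printf("ifindex %d\n", resolve_ifindex(&rt));
            return 0;
    }
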
@@ -343,7 +411,8 @@ static void process_req(struct work_struct *work)
                if (req->status == -ENODATA) {
                        src_in = (struct sockaddr *) &req->src_addr;
                        dst_in = (struct sockaddr *) &req->dst_addr;
-                       req->status = addr_resolve(src_in, dst_in, req->addr);
+                       req->status = addr_resolve(src_in, dst_in, req->addr,
+                                                  true);
                        if (req->status && time_after_eq(jiffies, req->timeout))
                                req->status = -ETIMEDOUT;
                        else if (req->status == -ENODATA)
@@ -403,7 +472,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
        req->client = client;
        atomic_inc(&client->refcount);
 
-       req->status = addr_resolve(src_in, dst_in, addr);
+       req->status = addr_resolve(src_in, dst_in, addr, true);
        switch (req->status) {
        case 0:
                req->timeout = jiffies;
@@ -425,6 +494,26 @@ err:
 }
 EXPORT_SYMBOL(rdma_resolve_ip);
 
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+                         const struct sockaddr *dst_addr,
+                         struct rdma_dev_addr *addr)
+{
+       struct sockaddr_storage ssrc_addr = {};
+       struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;
+
+       if (src_addr) {
+               if (src_addr->sa_family != dst_addr->sa_family)
+                       return -EINVAL;
+
+               memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+       } else {
+               src_in->sa_family = dst_addr->sa_family;
+       }
+
+       return addr_resolve(src_in, dst_addr, addr, false);
+}
+EXPORT_SYMBOL(rdma_resolve_ip_route);
+
 void rdma_addr_cancel(struct rdma_dev_addr *addr)
 {
        struct addr_req *req, *temp_req;
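
rdma_resolve_ip_route() is the synchronous, route-only sibling of rdma_resolve_ip(): it calls addr_resolve() with resolve_neigh = false, so the caller learns the egress device, netns and hop limit without waiting on neighbour discovery. A hedged usage sketch (my_lookup_route and my_dst are illustrative names; header and netns choice are assumptions):

    #include <rdma/ib_addr.h>       /* assumed header for the export */

    static int my_lookup_route(const struct sockaddr *my_dst)
    {
            struct rdma_dev_addr dev_addr = {};
            int ret;

            dev_addr.net = &init_net;       /* assumption: host netns */

            /* NULL src: the helper builds one in dst's address family */
            ret = rdma_resolve_ip_route(NULL, my_dst, &dev_addr);
            if (ret)
                    return ret;

            pr_info("route via ifindex %d, hoplimit %d\n",
                    dev_addr.bound_dev_if, dev_addr.hoplimit);
            return 0;
    }
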
@@ -456,8 +545,10 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
        complete(&((struct resolve_cb_context *)context)->comp);
 }
 
-int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid,
-                              u8 *dmac, u16 *vlan_id, int if_index)
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+                                const union ib_gid *dgid,
+                                u8 *dmac, u16 *vlan_id, int *if_index,
+                                int *hoplimit)
 {
        int ret = 0;
        struct rdma_dev_addr dev_addr;
@@ -475,7 +566,8 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi
        rdma_gid2ip(&dgid_addr._sockaddr, dgid);
 
        memset(&dev_addr, 0, sizeof(dev_addr));
-       dev_addr.bound_dev_if = if_index;
+       if (if_index)
+               dev_addr.bound_dev_if = *if_index;
        dev_addr.net = &init_net;
 
        ctx.addr = &dev_addr;
@@ -491,12 +583,16 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi
        dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
        if (!dev)
                return -ENODEV;
+       if (if_index)
+               *if_index = dev_addr.bound_dev_if;
        if (vlan_id)
                *vlan_id = rdma_vlan_dev_vlan_id(dev);
+       if (hoplimit)
+               *hoplimit = dev_addr.hoplimit;
        dev_put(dev);
        return ret;
 }
-EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh);
+EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh);
 
 int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
 {
index 89bebeada38b9b5f6a09d9dd896c8047fa57d7ad..53343ffbff7a1302b03ce5f6a66116b06f1e60b5 100644 (file)
@@ -64,6 +64,7 @@ enum gid_attr_find_mask {
        GID_ATTR_FIND_MASK_GID          = 1UL << 0,
        GID_ATTR_FIND_MASK_NETDEV       = 1UL << 1,
        GID_ATTR_FIND_MASK_DEFAULT      = 1UL << 2,
+       GID_ATTR_FIND_MASK_GID_TYPE     = 1UL << 3,
 };
 
 enum gid_table_entry_props {
@@ -81,10 +82,6 @@ enum gid_table_write_action {
 };
 
 struct ib_gid_table_entry {
-       /* This lock protects an entry from being
-        * read and written simultaneously.
-        */
-       rwlock_t            lock;
        unsigned long       props;
        union ib_gid        gid;
        struct ib_gid_attr  attr;
@@ -109,28 +106,86 @@ struct ib_gid_table {
         * are locked by this lock.
         **/
        struct mutex         lock;
+       /* This lock protects the table entries from being
+        * read and written simultaneously.
+        */
+       rwlock_t             rwlock;
        struct ib_gid_table_entry *data_vec;
 };
 
+static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
+{
+       if (rdma_cap_roce_gid_table(ib_dev, port)) {
+               struct ib_event event;
+
+               event.device            = ib_dev;
+               event.element.port_num  = port;
+               event.event             = IB_EVENT_GID_CHANGE;
+
+               ib_dispatch_event(&event);
+       }
+}
+
+static const char * const gid_type_str[] = {
+       [IB_GID_TYPE_IB]        = "IB/RoCE v1",
+       [IB_GID_TYPE_ROCE_UDP_ENCAP]    = "RoCE v2",
+};
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type)
+{
+       if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type])
+               return gid_type_str[gid_type];
+
+       return "Invalid GID type";
+}
+EXPORT_SYMBOL(ib_cache_gid_type_str);
+
+int ib_cache_gid_parse_type_str(const char *buf)
+{
+       unsigned int i;
+       size_t len;
+       int err = -EINVAL;
+
+       len = strlen(buf);
+       if (len == 0)
+               return -EINVAL;
+
+       if (buf[len - 1] == '\n')
+               len--;
+
+       for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i)
+               if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) &&
+                   len == strlen(gid_type_str[i])) {
+                       err = i;
+                       break;
+               }
+
+       return err;
+}
+EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
+
+/* This function expects that table->rwlock is held for writing in all
+ * scenarios, and that table->lock (the mutex) is additionally held in
+ * sleep-able (RoCE) scenarios.
+ */
 static int write_gid(struct ib_device *ib_dev, u8 port,
                     struct ib_gid_table *table, int ix,
                     const union ib_gid *gid,
                     const struct ib_gid_attr *attr,
                     enum gid_table_write_action action,
                     bool  default_gid)
+       __releases(&table->rwlock) __acquires(&table->rwlock)
 {
        int ret = 0;
        struct net_device *old_net_dev;
-       unsigned long flags;
 
        /* in rdma_cap_roce_gid_table, this function should be protected by a
         * sleep-able lock.
         */
-       write_lock_irqsave(&table->data_vec[ix].lock, flags);
 
        if (rdma_cap_roce_gid_table(ib_dev, port)) {
                table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
-               write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+               write_unlock_irq(&table->rwlock);
                /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
                 * RoCE providers and thus only updates the cache.
                 */
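
ib_cache_gid_type_str() and ib_cache_gid_parse_type_str() form the user-visible round trip for the new configfs knob: the store side maps "IB/RoCE v1" or "RoCE v2" (with or without a trailing newline) to the enum value, and the show side maps it back. A hedged sketch of a store-side consumer (parse_default_gid is an illustrative name):

    static int parse_default_gid(const char *buf, enum ib_gid_type *out)
    {
            int ret = ib_cache_gid_parse_type_str(buf);

            if (ret < 0)
                    return ret;     /* -EINVAL: unrecognized type string */

            *out = ret;
            pr_debug("default GID type set to %s\n",
                     ib_cache_gid_type_str(*out));
            return 0;
    }
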
@@ -140,7 +195,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
                else if (action == GID_TABLE_WRITE_ACTION_DEL)
                        ret = ib_dev->del_gid(ib_dev, port, ix,
                                              &table->data_vec[ix].context);
-               write_lock_irqsave(&table->data_vec[ix].lock, flags);
+               write_lock_irq(&table->rwlock);
        }
 
        old_net_dev = table->data_vec[ix].attr.ndev;
@@ -162,17 +217,6 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
 
        table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
 
-       write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
-
-       if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
-               struct ib_event event;
-
-               event.device            = ib_dev;
-               event.element.port_num  = port;
-               event.event             = IB_EVENT_GID_CHANGE;
-
-               ib_dispatch_event(&event);
-       }
        return ret;
 }
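
The __releases/__acquires pair added to write_gid() records, for sparse's context checking, that the function is entered and left with table->rwlock write-held but drops it around the provider callback (which may sleep). A hedged kernel-style sketch of the same shape (do_provider_call is illustrative; the annotations are no-ops unless checked with sparse):

    static void do_provider_call(void);

    static void update_entry(rwlock_t *lk)
            __releases(lk) __acquires(lk)
    {
            write_unlock_irq(lk);   /* callback below may sleep */
            do_provider_call();
            write_lock_irq(lk);
    }
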
 
@@ -201,41 +245,58 @@ static int del_gid(struct ib_device *ib_dev, u8 port,
                         GID_TABLE_WRITE_ACTION_DEL, default_gid);
 }
 
+/* rwlock should be read locked */
 static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
                    const struct ib_gid_attr *val, bool default_gid,
-                   unsigned long mask)
+                   unsigned long mask, int *pempty)
 {
-       int i;
+       int i = 0;
+       int found = -1;
+       int empty = pempty ? -1 : 0;
 
-       for (i = 0; i < table->sz; i++) {
-               unsigned long flags;
-               struct ib_gid_attr *attr = &table->data_vec[i].attr;
+       while (i < table->sz && (found < 0 || empty < 0)) {
+               struct ib_gid_table_entry *data = &table->data_vec[i];
+               struct ib_gid_attr *attr = &data->attr;
+               int curr_index = i;
 
-               read_lock_irqsave(&table->data_vec[i].lock, flags);
+               i++;
 
-               if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
-                       goto next;
+               if (data->props & GID_TABLE_ENTRY_INVALID)
+                       continue;
+
+               if (empty < 0)
+                       if (!memcmp(&data->gid, &zgid, sizeof(*gid)) &&
+                           !memcmp(attr, &zattr, sizeof(*attr)) &&
+                           !data->props)
+                               empty = curr_index;
+
+               if (found >= 0)
+                       continue;
+
+               if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
+                   attr->gid_type != val->gid_type)
+                       continue;
 
                if (mask & GID_ATTR_FIND_MASK_GID &&
-                   memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
-                       goto next;
+                   memcmp(gid, &data->gid, sizeof(*gid)))
+                       continue;
 
                if (mask & GID_ATTR_FIND_MASK_NETDEV &&
                    attr->ndev != val->ndev)
-                       goto next;
+                       continue;
 
                if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
-                   !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+                   !!(data->props & GID_TABLE_ENTRY_DEFAULT) !=
                    default_gid)
-                       goto next;
+                       continue;
 
-               read_unlock_irqrestore(&table->data_vec[i].lock, flags);
-               return i;
-next:
-               read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+               found = curr_index;
        }
 
-       return -1;
+       if (pempty)
+               *pempty = empty;
+
+       return found;
 }
 
 static void make_default_gid(struct  net_device *dev, union ib_gid *gid)
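
The rewritten find_gid() folds two scans into one: a single pass now reports both the first matching entry and, optionally via pempty, the first truly empty slot, so ib_cache_gid_add() no longer needs a second lookup for free space. A standalone sketch of the pattern (toy table, plain C; note that a NULL pempty pre-sets empty to 0 so the loop can stop at the first match, mirroring the kernel code):

    #include <stdio.h>

    struct entry { int used; int key; };

    static int find_or_empty(const struct entry *tab, int sz,
                             int key, int *pempty)
    {
            int found = -1, empty = pempty ? -1 : 0;

            for (int i = 0; i < sz && (found < 0 || empty < 0); i++) {
                    if (!tab[i].used) {
                            if (empty < 0)
                                    empty = i;      /* first hole */
                            continue;
                    }
                    if (found < 0 && tab[i].key == key)
                            found = i;              /* first match */
            }
            if (pempty)
                    *pempty = empty;
            return found;
    }

    int main(void)
    {
            struct entry tab[] = { {1, 7}, {0, 0}, {1, 9} };
            int empty;
            int ix = find_or_empty(tab, 3, 9, &empty);

            printf("match %d, empty %d\n", ix, empty);  /* 2 and 1 */
            return 0;
    }
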
@@ -252,6 +313,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
        int ix;
        int ret = 0;
        struct net_device *idev;
+       int empty;
 
        table = ports_table[port - rdma_start_port(ib_dev)];
 
@@ -275,22 +337,25 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
        }
 
        mutex_lock(&table->lock);
+       write_lock_irq(&table->rwlock);
 
        ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
-                     GID_ATTR_FIND_MASK_NETDEV);
+                     GID_ATTR_FIND_MASK_GID_TYPE |
+                     GID_ATTR_FIND_MASK_NETDEV, &empty);
        if (ix >= 0)
                goto out_unlock;
 
-       ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
-                     GID_ATTR_FIND_MASK_DEFAULT);
-       if (ix < 0) {
+       if (empty < 0) {
                ret = -ENOSPC;
                goto out_unlock;
        }
 
-       add_gid(ib_dev, port, table, ix, gid, attr, false);
+       ret = add_gid(ib_dev, port, table, empty, gid, attr, false);
+       if (!ret)
+               dispatch_gid_change_event(ib_dev, port);
 
 out_unlock:
+       write_unlock_irq(&table->rwlock);
        mutex_unlock(&table->lock);
        return ret;
 }
@@ -305,17 +370,22 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
        table = ports_table[port - rdma_start_port(ib_dev)];
 
        mutex_lock(&table->lock);
+       write_lock_irq(&table->rwlock);
 
        ix = find_gid(table, gid, attr, false,
                      GID_ATTR_FIND_MASK_GID      |
+                     GID_ATTR_FIND_MASK_GID_TYPE |
                      GID_ATTR_FIND_MASK_NETDEV   |
-                     GID_ATTR_FIND_MASK_DEFAULT);
+                     GID_ATTR_FIND_MASK_DEFAULT,
+                     NULL);
        if (ix < 0)
                goto out_unlock;
 
-       del_gid(ib_dev, port, table, ix, false);
+       if (!del_gid(ib_dev, port, table, ix, false))
+               dispatch_gid_change_event(ib_dev, port);
 
 out_unlock:
+       write_unlock_irq(&table->rwlock);
        mutex_unlock(&table->lock);
        return 0;
 }
@@ -326,16 +396,24 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
        struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
        struct ib_gid_table *table;
        int ix;
+       bool deleted = false;
 
        table  = ports_table[port - rdma_start_port(ib_dev)];
 
        mutex_lock(&table->lock);
+       write_lock_irq(&table->rwlock);
 
        for (ix = 0; ix < table->sz; ix++)
                if (table->data_vec[ix].attr.ndev == ndev)
-                       del_gid(ib_dev, port, table, ix, false);
+                       if (!del_gid(ib_dev, port, table, ix, false))
+                               deleted = true;
 
+       write_unlock_irq(&table->rwlock);
        mutex_unlock(&table->lock);
+
+       if (deleted)
+               dispatch_gid_change_event(ib_dev, port);
+
        return 0;
 }
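
dispatch_gid_change_event() is deliberately called only after table->rwlock (and the mutex) are dropped; the old code dispatched from inside write_gid(). Mutating under the lock while merely recording that something changed avoids re-entering event handlers with the table locked. A standalone pthread sketch of the "mutate now, notify later" shape:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define TBL_SZ 4

    struct table {
            pthread_rwlock_t rwlock;
            int keys[TBL_SZ];
    };

    static void notify_change(void)
    {
            printf("GID change event\n");   /* may take other locks */
    }

    static void del_matching(struct table *t, int key)
    {
            bool deleted = false;

            pthread_rwlock_wrlock(&t->rwlock);
            for (int i = 0; i < TBL_SZ; i++)
                    if (t->keys[i] == key) {
                            t->keys[i] = 0;         /* "del_gid" */
                            deleted = true;
                    }
            pthread_rwlock_unlock(&t->rwlock);

            if (deleted)
                    notify_change();        /* no locks held here */
    }

    int main(void)
    {
            struct table t = { PTHREAD_RWLOCK_INITIALIZER, {1, 5, 5, 2} };

            del_matching(&t, 5);
            return 0;
    }
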
 
@@ -344,18 +422,14 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
 {
        struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
        struct ib_gid_table *table;
-       unsigned long flags;
 
        table = ports_table[port - rdma_start_port(ib_dev)];
 
        if (index < 0 || index >= table->sz)
                return -EINVAL;
 
-       read_lock_irqsave(&table->data_vec[index].lock, flags);
-       if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
-               read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+       if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID)
                return -EAGAIN;
-       }
 
        memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
        if (attr) {
@@ -364,7 +438,6 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
                        dev_hold(attr->ndev);
        }
 
-       read_unlock_irqrestore(&table->data_vec[index].lock, flags);
        return 0;
 }
 
@@ -378,17 +451,21 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
        struct ib_gid_table *table;
        u8 p;
        int local_index;
+       unsigned long flags;
 
        for (p = 0; p < ib_dev->phys_port_cnt; p++) {
                table = ports_table[p];
-               local_index = find_gid(table, gid, val, false, mask);
+               read_lock_irqsave(&table->rwlock, flags);
+               local_index = find_gid(table, gid, val, false, mask, NULL);
                if (local_index >= 0) {
                        if (index)
                                *index = local_index;
                        if (port)
                                *port = p + rdma_start_port(ib_dev);
+                       read_unlock_irqrestore(&table->rwlock, flags);
                        return 0;
                }
+               read_unlock_irqrestore(&table->rwlock, flags);
        }
 
        return -ENOENT;
@@ -396,11 +473,13 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
 
 static int ib_cache_gid_find(struct ib_device *ib_dev,
                             const union ib_gid *gid,
+                            enum ib_gid_type gid_type,
                             struct net_device *ndev, u8 *port,
                             u16 *index)
 {
-       unsigned long mask = GID_ATTR_FIND_MASK_GID;
-       struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+       unsigned long mask = GID_ATTR_FIND_MASK_GID |
+                            GID_ATTR_FIND_MASK_GID_TYPE;
+       struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
 
        if (ndev)
                mask |= GID_ATTR_FIND_MASK_NETDEV;
@@ -411,14 +490,17 @@ static int ib_cache_gid_find(struct ib_device *ib_dev,
 
 int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
                               const union ib_gid *gid,
+                              enum ib_gid_type gid_type,
                               u8 port, struct net_device *ndev,
                               u16 *index)
 {
        int local_index;
        struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
        struct ib_gid_table *table;
-       unsigned long mask = GID_ATTR_FIND_MASK_GID;
-       struct ib_gid_attr val = {.ndev = ndev};
+       unsigned long mask = GID_ATTR_FIND_MASK_GID |
+                            GID_ATTR_FIND_MASK_GID_TYPE;
+       struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type};
+       unsigned long flags;
 
        if (port < rdma_start_port(ib_dev) ||
            port > rdma_end_port(ib_dev))
@@ -429,13 +511,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
        if (ndev)
                mask |= GID_ATTR_FIND_MASK_NETDEV;
 
-       local_index = find_gid(table, gid, &val, false, mask);
+       read_lock_irqsave(&table->rwlock, flags);
+       local_index = find_gid(table, gid, &val, false, mask, NULL);
        if (local_index >= 0) {
                if (index)
                        *index = local_index;
+               read_unlock_irqrestore(&table->rwlock, flags);
                return 0;
        }
 
+       read_unlock_irqrestore(&table->rwlock, flags);
        return -ENOENT;
 }
 EXPORT_SYMBOL(ib_find_cached_gid_by_port);
@@ -472,6 +557,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
        struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
        struct ib_gid_table *table;
        unsigned int i;
+       unsigned long flags;
        bool found = false;
 
        if (!ports_table)
@@ -484,11 +570,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
 
        table = ports_table[port - rdma_start_port(ib_dev)];
 
+       read_lock_irqsave(&table->rwlock, flags);
        for (i = 0; i < table->sz; i++) {
                struct ib_gid_attr attr;
-               unsigned long flags;
 
-               read_lock_irqsave(&table->data_vec[i].lock, flags);
                if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
                        goto next;
 
@@ -501,11 +586,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
                        found = true;
 
 next:
-               read_unlock_irqrestore(&table->data_vec[i].lock, flags);
-
                if (found)
                        break;
        }
+       read_unlock_irqrestore(&table->rwlock, flags);
 
        if (!found)
                return -ENOENT;
@@ -517,9 +601,9 @@ next:
 
 static struct ib_gid_table *alloc_gid_table(int sz)
 {
-       unsigned int i;
        struct ib_gid_table *table =
                kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+
        if (!table)
                return NULL;
 
@@ -530,9 +614,7 @@ static struct ib_gid_table *alloc_gid_table(int sz)
        mutex_init(&table->lock);
 
        table->sz = sz;
-
-       for (i = 0; i < sz; i++)
-               rwlock_init(&table->data_vec[i].lock);
+       rwlock_init(&table->rwlock);
 
        return table;
 
@@ -553,30 +635,37 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
                                   struct ib_gid_table *table)
 {
        int i;
+       bool deleted = false;
 
        if (!table)
                return;
 
+       write_lock_irq(&table->rwlock);
        for (i = 0; i < table->sz; ++i) {
                if (memcmp(&table->data_vec[i].gid, &zgid,
                           sizeof(table->data_vec[i].gid)))
-                       del_gid(ib_dev, port, table, i,
-                               table->data_vec[i].props &
-                               GID_ATTR_FIND_MASK_DEFAULT);
+                       if (!del_gid(ib_dev, port, table, i,
+                                    table->data_vec[i].props &
+                                    GID_ATTR_FIND_MASK_DEFAULT))
+                               deleted = true;
        }
+       write_unlock_irq(&table->rwlock);
+
+       if (deleted)
+               dispatch_gid_change_event(ib_dev, port);
 }
 
 void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
                                  struct net_device *ndev,
+                                 unsigned long gid_type_mask,
                                  enum ib_cache_gid_default_mode mode)
 {
        struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
        union ib_gid gid;
        struct ib_gid_attr gid_attr;
+       struct ib_gid_attr zattr_type = zattr;
        struct ib_gid_table *table;
-       int ix;
-       union ib_gid current_gid;
-       struct ib_gid_attr current_gid_attr = {};
+       unsigned int gid_type;
 
        table  = ports_table[port - rdma_start_port(ib_dev)];
 
@@ -584,46 +673,82 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
        memset(&gid_attr, 0, sizeof(gid_attr));
        gid_attr.ndev = ndev;
 
-       mutex_lock(&table->lock);
-       ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
-
-       /* Coudn't find default GID location */
-       WARN_ON(ix < 0);
-
-       if (!__ib_cache_gid_get(ib_dev, port, ix,
-                               &current_gid, &current_gid_attr) &&
-           mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
-           !memcmp(&gid, &current_gid, sizeof(gid)) &&
-           !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
-               goto unlock;
-
-       if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
-            memcmp(&current_gid_attr, &zattr,
-                   sizeof(current_gid_attr))) &&
-           del_gid(ib_dev, port, table, ix, true)) {
-               pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
-                       ix, gid.raw);
-               goto unlock;
-       }
+       for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
+               int ix;
+               union ib_gid current_gid;
+               struct ib_gid_attr current_gid_attr = {};
+
+               if (1UL << gid_type & ~gid_type_mask)
+                       continue;
+
+               gid_attr.gid_type = gid_type;
+
+               mutex_lock(&table->lock);
+               write_lock_irq(&table->rwlock);
+               ix = find_gid(table, NULL, &gid_attr, true,
+                             GID_ATTR_FIND_MASK_GID_TYPE |
+                             GID_ATTR_FIND_MASK_DEFAULT,
+                             NULL);
+
+               /* Couldn't find the default GID location */
+               WARN_ON(ix < 0);
+
+               zattr_type.gid_type = gid_type;
+
+               if (!__ib_cache_gid_get(ib_dev, port, ix,
+                                       &current_gid, &current_gid_attr) &&
+                   mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+                   !memcmp(&gid, &current_gid, sizeof(gid)) &&
+                   !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+                       goto release;
+
+               if (memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+                   memcmp(&current_gid_attr, &zattr_type,
+                          sizeof(current_gid_attr))) {
+                       if (del_gid(ib_dev, port, table, ix, true)) {
+                               pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+                                       ix, gid.raw);
+                               goto release;
+                       } else {
+                               dispatch_gid_change_event(ib_dev, port);
+                       }
+               }
 
-       if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
-               if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
-                       pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
-                               gid.raw);
+               if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
+                       if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+                               pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+                                       gid.raw);
+                       else
+                               dispatch_gid_change_event(ib_dev, port);
+               }
 
-unlock:
-       if (current_gid_attr.ndev)
-               dev_put(current_gid_attr.ndev);
-       mutex_unlock(&table->lock);
+release:
+               if (current_gid_attr.ndev)
+                       dev_put(current_gid_attr.ndev);
+               write_unlock_irq(&table->rwlock);
+               mutex_unlock(&table->lock);
+       }
 }
 
 static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
                                     struct ib_gid_table *table)
 {
-       if (rdma_protocol_roce(ib_dev, port)) {
-               struct ib_gid_table_entry *entry = &table->data_vec[0];
+       unsigned int i;
+       unsigned long roce_gid_type_mask;
+       unsigned int num_default_gids;
+       unsigned int current_gid = 0;
+
+       roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+       num_default_gids = hweight_long(roce_gid_type_mask);
+       for (i = 0; i < num_default_gids && i < table->sz; i++) {
+               struct ib_gid_table_entry *entry =
+                       &table->data_vec[i];
 
                entry->props |= GID_TABLE_ENTRY_DEFAULT;
+               current_gid = find_next_bit(&roce_gid_type_mask,
+                                           BITS_PER_LONG,
+                                           current_gid);
+               entry->attr.gid_type = current_gid++;
        }
 
        return 0;
@@ -728,20 +853,30 @@ int ib_get_cached_gid(struct ib_device *device,
                      union ib_gid     *gid,
                      struct ib_gid_attr *gid_attr)
 {
+       int res;
+       unsigned long flags;
+       struct ib_gid_table **ports_table = device->cache.gid_cache;
+       struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)];
+
        if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
                return -EINVAL;
 
-       return __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
+       read_lock_irqsave(&table->rwlock, flags);
+       res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
+       read_unlock_irqrestore(&table->rwlock, flags);
+
+       return res;
 }
 EXPORT_SYMBOL(ib_get_cached_gid);
 
 int ib_find_cached_gid(struct ib_device *device,
                       const union ib_gid *gid,
+                      enum ib_gid_type gid_type,
                       struct net_device *ndev,
                       u8               *port_num,
                       u16              *index)
 {
-       return ib_cache_gid_find(device, gid, ndev, port_num, index);
+       return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index);
 }
 EXPORT_SYMBOL(ib_find_cached_gid);
 
@@ -956,10 +1091,12 @@ static void ib_cache_update(struct ib_device *device,
 
        device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
        if (!use_roce_gid_table) {
+               write_lock(&table->rwlock);
                for (i = 0; i < gid_cache->table_len; i++) {
                        modify_gid(device, port, table, i, gid_cache->table + i,
                                   &zattr, false);
                }
+               write_unlock(&table->rwlock);
        }
 
        device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
index 0a26dd6d9b19f96d97c9b3b244509892afae7023..1d92e091e22ef0bdb64ee7433eb2cf5fbba9600c 100644 (file)
@@ -364,7 +364,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
        read_lock_irqsave(&cm.device_lock, flags);
        list_for_each_entry(cm_dev, &cm.device_list, list) {
                if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
-                                       ndev, &p, NULL)) {
+                                       path->gid_type, ndev, &p, NULL)) {
                        port = cm_dev->port[p-1];
                        break;
                }
@@ -782,11 +782,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
        wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
 
        /* Check if the device started its remove_one */
-       spin_lock_irq(&cm.lock);
+       spin_lock_irqsave(&cm.lock, flags);
        if (!cm_dev->going_down)
                queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
                                   msecs_to_jiffies(wait_time));
-       spin_unlock_irq(&cm.lock);
+       spin_unlock_irqrestore(&cm.lock, flags);
 
        cm_id_priv->timewait_info = NULL;
 }
@@ -1600,6 +1600,8 @@ static int cm_req_handler(struct cm_work *work)
        struct ib_cm_id *cm_id;
        struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
        struct cm_req_msg *req_msg;
+       union ib_gid gid;
+       struct ib_gid_attr gid_attr;
        int ret;
 
        req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
@@ -1639,11 +1641,31 @@ static int cm_req_handler(struct cm_work *work)
        cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
 
        memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
-       ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+       work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit;
+       ret = ib_get_cached_gid(work->port->cm_dev->ib_device,
+                               work->port->port_num,
+                               cm_id_priv->av.ah_attr.grh.sgid_index,
+                               &gid, &gid_attr);
+       if (!ret) {
+               if (gid_attr.ndev) {
+                       work->path[0].ifindex = gid_attr.ndev->ifindex;
+                       work->path[0].net = dev_net(gid_attr.ndev);
+                       dev_put(gid_attr.ndev);
+               }
+               work->path[0].gid_type = gid_attr.gid_type;
+               ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+       }
        if (ret) {
-               ib_get_cached_gid(work->port->cm_dev->ib_device,
-                                 work->port->port_num, 0, &work->path[0].sgid,
-                                 NULL);
+               int err = ib_get_cached_gid(work->port->cm_dev->ib_device,
+                                           work->port->port_num, 0,
+                                           &work->path[0].sgid,
+                                           &gid_attr);
+               if (!err && gid_attr.ndev) {
+                       work->path[0].ifindex = gid_attr.ndev->ifindex;
+                       work->path[0].net = dev_net(gid_attr.ndev);
+                       dev_put(gid_attr.ndev);
+               }
+               work->path[0].gid_type = gid_attr.gid_type;
                ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
                               &work->path[0].sgid, sizeof work->path[0].sgid,
                               NULL, 0);
@@ -3482,6 +3504,7 @@ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
 EXPORT_SYMBOL(ib_cm_notify);
 
 static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+                           struct ib_mad_send_buf *send_buf,
                            struct ib_mad_recv_wc *mad_recv_wc)
 {
        struct cm_port *port = mad_agent->context;
@@ -3731,16 +3754,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
 }
 EXPORT_SYMBOL(ib_cm_init_qp_attr);
 
-static void cm_get_ack_delay(struct cm_device *cm_dev)
-{
-       struct ib_device_attr attr;
-
-       if (ib_query_device(cm_dev->ib_device, &attr))
-               cm_dev->ack_delay = 0; /* acks will rely on packet life time */
-       else
-               cm_dev->ack_delay = attr.local_ca_ack_delay;
-}
-
 static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
                               char *buf)
 {
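
cm_get_ack_delay() disappears because device attributes are now captured once at registration and kept in ib_device->attrs, so consumers read a cached field instead of issuing ib_query_device() on every device add. A hedged in-kernel sketch of the new consumer pattern:

    static u8 get_ack_delay(const struct ib_device *dev)
    {
            /* attrs is filled in at device registration time */
            return dev->attrs.local_ca_ack_delay;
    }
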
@@ -3852,7 +3865,7 @@ static void cm_add_one(struct ib_device *ib_device)
                return;
 
        cm_dev->ib_device = ib_device;
-       cm_get_ack_delay(cm_dev);
+       cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
        cm_dev->going_down = 0;
        cm_dev->device = device_create(&cm_class, &ib_device->dev,
                                       MKDEV(0, 0), NULL,
index 2d762a2ecd81252cde34cbc2dbc1083c1d8832dc..9729639df407abdc382959ef8151fbbcc3370589 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/in6.h>
 #include <linux/mutex.h>
 #include <linux/random.h>
+#include <linux/igmp.h>
 #include <linux/idr.h>
 #include <linux/inetdevice.h>
 #include <linux/slab.h>
@@ -60,6 +61,8 @@
 #include <rdma/ib_sa.h>
 #include <rdma/iw_cm.h>
 
+#include "core_priv.h"
+
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("Generic RDMA CM Agent");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -150,6 +153,7 @@ struct cma_device {
        struct completion       comp;
        atomic_t                refcount;
        struct list_head        id_list;
+       enum ib_gid_type        *default_gid_type;
 };
 
 struct rdma_bind_list {
@@ -185,6 +189,67 @@ enum {
        CMA_OPTION_AFONLY,
 };
 
+void cma_ref_dev(struct cma_device *cma_dev)
+{
+       atomic_inc(&cma_dev->refcount);
+}
+
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+                                            void               *cookie)
+{
+       struct cma_device *cma_dev;
+       struct cma_device *found_cma_dev = NULL;
+
+       mutex_lock(&lock);
+
+       list_for_each_entry(cma_dev, &dev_list, list)
+               if (filter(cma_dev->device, cookie)) {
+                       found_cma_dev = cma_dev;
+                       break;
+               }
+
+       if (found_cma_dev)
+               cma_ref_dev(found_cma_dev);
+       mutex_unlock(&lock);
+       return found_cma_dev;
+}
+
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+                            unsigned int port)
+{
+       if (port < rdma_start_port(cma_dev->device) ||
+           port > rdma_end_port(cma_dev->device))
+               return -EINVAL;
+
+       return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)];
+}
+
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+                            unsigned int port,
+                            enum ib_gid_type default_gid_type)
+{
+       unsigned long supported_gids;
+
+       if (port < rdma_start_port(cma_dev->device) ||
+           port > rdma_end_port(cma_dev->device))
+               return -EINVAL;
+
+       supported_gids = roce_gid_type_mask_support(cma_dev->device, port);
+
+       if (!(supported_gids & 1 << default_gid_type))
+               return -EINVAL;
+
+       cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] =
+               default_gid_type;
+
+       return 0;
+}
+
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev)
+{
+       return cma_dev->device;
+}
+
 /*
  * Device removal can occur at any time, so we need extra handling to
  * serialize notifying the user of device removal with other callbacks.
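
cma_enum_devices_by_ibdev() above follows the standard lookup discipline: the match and the cma_ref_dev() both happen while the global list mutex is held, so the device cannot be freed between being found and being pinned. A standalone pthread sketch of the same find-then-ref-under-lock pattern:

    #include <pthread.h>
    #include <stddef.h>

    struct dev { int refs; int id; struct dev *next; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct dev *dev_list;

    static struct dev *find_and_ref(int id)
    {
            struct dev *d;

            pthread_mutex_lock(&lock);
            for (d = dev_list; d; d = d->next)
                    if (d->id == id)
                            break;
            if (d)
                    d->refs++;      /* ref taken before unlocking */
            pthread_mutex_unlock(&lock);
            return d;               /* NULL if no match */
    }
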
@@ -228,6 +293,7 @@ struct rdma_id_private {
        u8                      tos;
        u8                      reuseaddr;
        u8                      afonly;
+       enum ib_gid_type        gid_type;
 };
 
 struct cma_multicast {
@@ -239,6 +305,7 @@ struct cma_multicast {
        void                    *context;
        struct sockaddr_storage addr;
        struct kref             mcref;
+       bool                    igmp_joined;
 };
 
 struct cma_work {
@@ -335,18 +402,48 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
        hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
 }
 
-static void cma_attach_to_dev(struct rdma_id_private *id_priv,
-                             struct cma_device *cma_dev)
+static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
 {
-       atomic_inc(&cma_dev->refcount);
+       struct in_device *in_dev = NULL;
+
+       if (ndev) {
+               rtnl_lock();
+               in_dev = __in_dev_get_rtnl(ndev);
+               if (in_dev) {
+                       if (join)
+                               ip_mc_inc_group(in_dev,
+                                               *(__be32 *)(mgid->raw + 12));
+                       else
+                               ip_mc_dec_group(in_dev,
+                                               *(__be32 *)(mgid->raw + 12));
+               }
+               rtnl_unlock();
+       }
+       return (in_dev) ? 0 : -ENODEV;
+}
+
+static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
+                              struct cma_device *cma_dev)
+{
+       cma_ref_dev(cma_dev);
        id_priv->cma_dev = cma_dev;
+       id_priv->gid_type = 0;
        id_priv->id.device = cma_dev->device;
        id_priv->id.route.addr.dev_addr.transport =
                rdma_node_get_transport(cma_dev->device->node_type);
        list_add_tail(&id_priv->list, &cma_dev->id_list);
 }
 
-static inline void cma_deref_dev(struct cma_device *cma_dev)
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+                             struct cma_device *cma_dev)
+{
+       _cma_attach_to_dev(id_priv, cma_dev);
+       id_priv->gid_type =
+               cma_dev->default_gid_type[id_priv->id.port_num -
+                                         rdma_start_port(cma_dev->device)];
+}
+
+void cma_deref_dev(struct cma_device *cma_dev)
 {
        if (atomic_dec_and_test(&cma_dev->refcount))
                complete(&cma_dev->comp);
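
cma_igmp_send() exists because a RoCE v2 (UDP-encapsulated) multicast join must also join the underlying IPv4 group; for IPv4, the group address lives in the last four bytes of the 16-byte MGID, which is what *(__be32 *)(mgid->raw + 12) extracts. A standalone sketch of that mapping (the group address is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint8_t raw[16] = {0};          /* stand-in for ib_gid.raw */
            uint32_t group = 0xe0000116;    /* 224.0.1.22 */

            /* embed the IPv4 group in the GID tail, big-endian */
            raw[12] = group >> 24;
            raw[13] = group >> 16;
            raw[14] = group >> 8;
            raw[15] = group;

            /* what the kernel reads back at mgid->raw + 12 */
            printf("%u.%u.%u.%u\n", raw[12], raw[13], raw[14], raw[15]);
            return 0;
    }
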
@@ -441,6 +538,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a
 }
 
 static inline int cma_validate_port(struct ib_device *device, u8 port,
+                                   enum ib_gid_type gid_type,
                                      union ib_gid *gid, int dev_type,
                                      int bound_if_index)
 {
@@ -453,10 +551,25 @@ static inline int cma_validate_port(struct ib_device *device, u8 port,
        if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
                return ret;
 
-       if (dev_type == ARPHRD_ETHER)
+       if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
                ndev = dev_get_by_index(&init_net, bound_if_index);
+               if (ndev && ndev->flags & IFF_LOOPBACK) {
+                       pr_info("detected loopback device\n");
+                       dev_put(ndev);
 
-       ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL);
+                       if (!device->get_netdev)
+                               return -EOPNOTSUPP;
+
+                       ndev = device->get_netdev(device, port);
+                       if (!ndev)
+                               return -ENODEV;
+               }
+       } else {
+               gid_type = IB_GID_TYPE_IB;
+       }
+
+       ret = ib_find_cached_gid_by_port(device, gid, gid_type, port,
+                                        ndev, NULL);
 
        if (ndev)
                dev_put(ndev);
@@ -490,7 +603,10 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
                gidp = rdma_protocol_roce(cma_dev->device, port) ?
                       &iboe_gid : &gid;
 
-               ret = cma_validate_port(cma_dev->device, port, gidp,
+               ret = cma_validate_port(cma_dev->device, port,
+                                       rdma_protocol_ib(cma_dev->device, port) ?
+                                       IB_GID_TYPE_IB :
+                                       listen_id_priv->gid_type, gidp,
                                        dev_addr->dev_type,
                                        dev_addr->bound_dev_if);
                if (!ret) {
@@ -509,8 +625,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
                        gidp = rdma_protocol_roce(cma_dev->device, port) ?
                               &iboe_gid : &gid;
 
-                       ret = cma_validate_port(cma_dev->device, port, gidp,
-                                               dev_addr->dev_type,
+                       ret = cma_validate_port(cma_dev->device, port,
+                                               rdma_protocol_ib(cma_dev->device, port) ?
+                                               IB_GID_TYPE_IB :
+                                               cma_dev->default_gid_type[port - 1],
+                                               gidp, dev_addr->dev_type,
                                                dev_addr->bound_dev_if);
                        if (!ret) {
                                id_priv->id.port_num = port;
@@ -1437,8 +1556,24 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
                                      id_priv->id.port_num)) {
                        ib_sa_free_multicast(mc->multicast.ib);
                        kfree(mc);
-               } else
+               } else {
+                       if (mc->igmp_joined) {
+                               struct rdma_dev_addr *dev_addr =
+                                       &id_priv->id.route.addr.dev_addr;
+                               struct net_device *ndev = NULL;
+
+                               if (dev_addr->bound_dev_if)
+                                       ndev = dev_get_by_index(&init_net,
+                                                               dev_addr->bound_dev_if);
+                               if (ndev) {
+                                       cma_igmp_send(ndev,
+                                                     &mc->multicast.ib->rec.mgid,
+                                                     false);
+                                       dev_put(ndev);
+                               }
+                       }
                        kref_put(&mc->mcref, release_mc);
+               }
        }
 }
 
@@ -1896,7 +2031,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
        struct rdma_id_private *listen_id, *conn_id;
        struct rdma_cm_event event;
        int ret;
-       struct ib_device_attr attr;
        struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
        struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
 
@@ -1938,13 +2072,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
        memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
        memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
 
-       ret = ib_query_device(conn_id->id.device, &attr);
-       if (ret) {
-               mutex_unlock(&conn_id->handler_mutex);
-               rdma_destroy_id(new_cm_id);
-               goto out;
-       }
-
        memset(&event, 0, sizeof event);
        event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
        event.param.conn.private_data = iw_event->private_data;
@@ -2051,7 +2178,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
        memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
               rdma_addr_size(cma_src_addr(id_priv)));
 
-       cma_attach_to_dev(dev_id_priv, cma_dev);
+       _cma_attach_to_dev(dev_id_priv, cma_dev);
        list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
        atomic_inc(&id_priv->refcount);
        dev_id_priv->internal_id = 1;
@@ -2321,8 +2448,23 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 
        if (addr->dev_addr.bound_dev_if) {
                ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+               if (!ndev)
+                       return -ENODEV;
+
+               if (ndev->flags & IFF_LOOPBACK) {
+                       dev_put(ndev);
+                       if (!id_priv->id.device->get_netdev)
+                               return -EOPNOTSUPP;
+
+                       ndev = id_priv->id.device->get_netdev(id_priv->id.device,
+                                                             id_priv->id.port_num);
+                       if (!ndev)
+                               return -ENODEV;
+               }
+
                route->path_rec->net = &init_net;
-               route->path_rec->ifindex = addr->dev_addr.bound_dev_if;
+               route->path_rec->ifindex = ndev->ifindex;
+               route->path_rec->gid_type = id_priv->gid_type;
        }
        if (!ndev) {
                ret = -ENODEV;
@@ -2336,7 +2478,14 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
        rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
                    &route->path_rec->dgid);
 
-       route->path_rec->hop_limit = 1;
+       /* Use the hint from IP Stack to select GID Type */
+       if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network))
+               route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+       if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB)
+               /* TODO: get the hoplimit from the inet/inet6 device */
+               route->path_rec->hop_limit = addr->dev_addr.hoplimit;
+       else
+               route->path_rec->hop_limit = 1;
        route->path_rec->reversible = 1;
        route->path_rec->pkey = cpu_to_be16(0xffff);
        route->path_rec->mtu_selector = IB_SA_EQ;
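[Editor's note: the GID-type hint above maps the network type recorded during address resolution to a GID type and only ever upgrades path_rec->gid_type, so a configured RoCE v2 default is never silently downgraded to v1. A sketch of ib_network_to_gid_type(), which this series adds to include/rdma/ib_addr.h, assuming the RDMA_NETWORK_* enum from the same series:

    static inline enum ib_gid_type
    ib_network_to_gid_type(enum rdma_network_type network_type)
    {
            /* Addresses resolved through the IPv4/IPv6 stack want a
             * RoCE v2 (UDP-encapsulated) GID; everything else keeps the
             * RoCE v1 type, which shares its value with IB_GID_TYPE_IB.
             */
            if (network_type == RDMA_NETWORK_IPV4 ||
                network_type == RDMA_NETWORK_IPV6)
                    return IB_GID_TYPE_ROCE_UDP_ENCAP;

            return IB_GID_TYPE_IB;
    }]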
@@ -3534,12 +3683,23 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
        event.status = status;
        event.param.ud.private_data = mc->context;
        if (!status) {
+               struct rdma_dev_addr *dev_addr =
+                       &id_priv->id.route.addr.dev_addr;
+               struct net_device *ndev =
+                       dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+               enum ib_gid_type gid_type =
+                       id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+                       rdma_start_port(id_priv->cma_dev->device)];
+
                event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
                ib_init_ah_from_mcmember(id_priv->id.device,
                                         id_priv->id.port_num, &multicast->rec,
+                                        ndev, gid_type,
                                         &event.param.ud.ah_attr);
                event.param.ud.qp_num = 0xFFFFFF;
                event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+               if (ndev)
+                       dev_put(ndev);
        } else
                event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
 
@@ -3672,9 +3832,10 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
 {
        struct iboe_mcast_work *work;
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
-       int err;
+       int err = 0;
        struct sockaddr *addr = (struct sockaddr *)&mc->addr;
        struct net_device *ndev = NULL;
+       enum ib_gid_type gid_type;
 
        if (cma_zero_addr((struct sockaddr *)&mc->addr))
                return -EINVAL;
@@ -3704,9 +3865,25 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
        mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
        mc->multicast.ib->rec.hop_limit = 1;
        mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+
+       gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+                  rdma_start_port(id_priv->cma_dev->device)];
+       if (addr->sa_family == AF_INET) {
+               if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+                       err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
+                                           true);
+               if (!err) {
+                       mc->igmp_joined = true;
+                       mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+               }
+       } else {
+               if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+                       err = -ENOTSUPP;
+       }
        dev_put(ndev);
-       if (!mc->multicast.ib->rec.mtu) {
-               err = -EINVAL;
+       if (err || !mc->multicast.ib->rec.mtu) {
+               if (!err)
+                       err = -EINVAL;
                goto out2;
        }
        rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
@@ -3745,7 +3922,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
        memcpy(&mc->addr, addr, rdma_addr_size(addr));
        mc->context = context;
        mc->id_priv = id_priv;
-
+       mc->igmp_joined = false;
        spin_lock(&id_priv->lock);
        list_add(&mc->list, &id_priv->mc_list);
        spin_unlock(&id_priv->lock);
@@ -3790,9 +3967,25 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
                        if (rdma_cap_ib_mcast(id->device, id->port_num)) {
                                ib_sa_free_multicast(mc->multicast.ib);
                                kfree(mc);
-                       } else if (rdma_protocol_roce(id->device, id->port_num))
+                       } else if (rdma_protocol_roce(id->device, id->port_num)) {
+                               if (mc->igmp_joined) {
+                                       struct rdma_dev_addr *dev_addr =
+                                               &id->route.addr.dev_addr;
+                                       struct net_device *ndev = NULL;
+
+                                       if (dev_addr->bound_dev_if)
+                                               ndev = dev_get_by_index(&init_net,
+                                                                       dev_addr->bound_dev_if);
+                                       if (ndev) {
+                                               cma_igmp_send(ndev,
+                                                             &mc->multicast.ib->rec.mgid,
+                                                             false);
+                                               dev_put(ndev);
+                                       }
+                                       mc->igmp_joined = false;
+                               }
                                kref_put(&mc->mcref, release_mc);
-
+                       }
                        return;
                }
        }
@@ -3861,12 +4054,27 @@ static void cma_add_one(struct ib_device *device)
 {
        struct cma_device *cma_dev;
        struct rdma_id_private *id_priv;
+       unsigned int i;
+       unsigned long supported_gids = 0;
 
        cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
        if (!cma_dev)
                return;
 
        cma_dev->device = device;
+       cma_dev->default_gid_type = kcalloc(device->phys_port_cnt,
+                                           sizeof(*cma_dev->default_gid_type),
+                                           GFP_KERNEL);
+       if (!cma_dev->default_gid_type) {
+               kfree(cma_dev);
+               return;
+       }
+       for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+               supported_gids = roce_gid_type_mask_support(device, i);
+               WARN_ON(!supported_gids);
+               cma_dev->default_gid_type[i - rdma_start_port(device)] =
+                       find_first_bit(&supported_gids, BITS_PER_LONG);
+       }
 
        init_completion(&cma_dev->comp);
        atomic_set(&cma_dev->refcount, 1);
@@ -3946,6 +4154,7 @@ static void cma_remove_one(struct ib_device *device, void *client_data)
        mutex_unlock(&lock);
 
        cma_process_remove(cma_dev);
+       kfree(cma_dev->default_gid_type);
        kfree(cma_dev);
 }
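[Editor's note: the per-port default_gid_type array initialized in cma_add_one() above, and freed here, is what the new configfs interface reads and writes through cma_get_default_gid_type()/cma_set_default_gid_type(), declared in core_priv.h later in this diff. A sketch of the setter, assuming it validates the requested type against roce_gid_type_mask_support(); the exact upstream body may differ:

    int cma_set_default_gid_type(struct cma_device *cma_dev,
                                 unsigned int port,
                                 enum ib_gid_type default_gid_type)
    {
            unsigned long supported_gids;

            if (port < rdma_start_port(cma_dev->device) ||
                port > rdma_end_port(cma_dev->device))
                    return -EINVAL;

            supported_gids = roce_gid_type_mask_support(cma_dev->device, port);

            /* reject GID types the port does not actually support */
            if (!(supported_gids & (1 << default_gid_type)))
                    return -EINVAL;

            cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] =
                    default_gid_type;
            return 0;
    }]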
 
@@ -4079,6 +4288,7 @@ static int __init cma_init(void)
 
        if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
                printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n");
+       cma_configfs_init();
 
        return 0;
 
@@ -4093,6 +4303,7 @@ err_wq:
 
 static void __exit cma_cleanup(void)
 {
+       cma_configfs_exit();
        ibnl_remove_client(RDMA_NL_RDMA_CM);
        ib_unregister_client(&cma_client);
        unregister_netdevice_notifier(&cma_nb);
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
new file mode 100644 (file)
index 0000000..18b112a
--- /dev/null
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <rdma/ib_verbs.h>
+#include "core_priv.h"
+
+struct cma_device;
+
+struct cma_dev_group;
+
+struct cma_dev_port_group {
+       unsigned int            port_num;
+       struct cma_dev_group    *cma_dev_group;
+       struct config_group     group;
+};
+
+struct cma_dev_group {
+       char                            name[IB_DEVICE_NAME_MAX];
+       struct config_group             device_group;
+       struct config_group             ports_group;
+       struct config_group             *default_dev_group[2];
+       struct config_group             **default_ports_group;
+       struct cma_dev_port_group       *ports;
+};
+
+static struct cma_dev_port_group *to_dev_port_group(struct config_item *item)
+{
+       struct config_group *group;
+
+       if (!item)
+               return NULL;
+
+       group = container_of(item, struct config_group, cg_item);
+       return container_of(group, struct cma_dev_port_group, group);
+}
+
+static bool filter_by_name(struct ib_device *ib_dev, void *cookie)
+{
+       return !strcmp(ib_dev->name, cookie);
+}
+
+static int cma_configfs_params_get(struct config_item *item,
+                                  struct cma_device **pcma_dev,
+                                  struct cma_dev_port_group **pgroup)
+{
+       struct cma_dev_port_group *group = to_dev_port_group(item);
+       struct cma_device *cma_dev;
+
+       if (!group)
+               return -ENODEV;
+
+       cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+                                           group->cma_dev_group->name);
+       if (!cma_dev)
+               return -ENODEV;
+
+       *pcma_dev = cma_dev;
+       *pgroup = group;
+
+       return 0;
+}
+
+static void cma_configfs_params_put(struct cma_device *cma_dev)
+{
+       cma_deref_dev(cma_dev);
+}
+
+static ssize_t default_roce_mode_show(struct config_item *item,
+                                     char *buf)
+{
+       struct cma_device *cma_dev;
+       struct cma_dev_port_group *group;
+       int gid_type;
+       ssize_t ret;
+
+       ret = cma_configfs_params_get(item, &cma_dev, &group);
+       if (ret)
+               return ret;
+
+       gid_type = cma_get_default_gid_type(cma_dev, group->port_num);
+       cma_configfs_params_put(cma_dev);
+
+       if (gid_type < 0)
+               return gid_type;
+
+       return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type));
+}
+
+static ssize_t default_roce_mode_store(struct config_item *item,
+                                      const char *buf, size_t count)
+{
+       struct cma_device *cma_dev;
+       struct cma_dev_port_group *group;
+       int gid_type = ib_cache_gid_parse_type_str(buf);
+       ssize_t ret;
+
+       if (gid_type < 0)
+               return -EINVAL;
+
+       ret = cma_configfs_params_get(item, &cma_dev, &group);
+       if (ret)
+               return ret;
+
+       ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type);
+
+       cma_configfs_params_put(cma_dev);
+
+       return !ret ? strnlen(buf, count) : ret;
+}
+
+CONFIGFS_ATTR(, default_roce_mode);
+
+static struct configfs_attribute *cma_configfs_attributes[] = {
+       &attr_default_roce_mode,
+       NULL,
+};
+
+static struct config_item_type cma_port_group_type = {
+       .ct_attrs       = cma_configfs_attributes,
+       .ct_owner       = THIS_MODULE
+};
+
+static int make_cma_ports(struct cma_dev_group *cma_dev_group,
+                         struct cma_device *cma_dev)
+{
+       struct ib_device *ibdev;
+       unsigned int i;
+       unsigned int ports_num;
+       struct cma_dev_port_group *ports;
+       struct config_group **ports_group;
+       int err;
+
+       ibdev = cma_get_ib_dev(cma_dev);
+
+       if (!ibdev)
+               return -ENODEV;
+
+       ports_num = ibdev->phys_port_cnt;
+       ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports),
+                       GFP_KERNEL);
+       ports_group = kcalloc(ports_num + 1, sizeof(*ports_group), GFP_KERNEL);
+
+       if (!ports || !ports_group) {
+               err = -ENOMEM;
+               goto free;
+       }
+
+       for (i = 0; i < ports_num; i++) {
+               char port_str[10];
+
+               ports[i].port_num = i + 1;
+               snprintf(port_str, sizeof(port_str), "%u", i + 1);
+               ports[i].cma_dev_group = cma_dev_group;
+               config_group_init_type_name(&ports[i].group,
+                                           port_str,
+                                           &cma_port_group_type);
+               ports_group[i] = &ports[i].group;
+       }
+       ports_group[i] = NULL;
+       cma_dev_group->default_ports_group = ports_group;
+       cma_dev_group->ports = ports;
+
+       return 0;
+free:
+       kfree(ports);
+       kfree(ports_group);
+       cma_dev_group->ports = NULL;
+       cma_dev_group->default_ports_group = NULL;
+       return err;
+}
+
+static void release_cma_dev(struct config_item  *item)
+{
+       struct config_group *group = container_of(item, struct config_group,
+                                                 cg_item);
+       struct cma_dev_group *cma_dev_group = container_of(group,
+                                                          struct cma_dev_group,
+                                                          device_group);
+
+       kfree(cma_dev_group);
+}
+
+static void release_cma_ports_group(struct config_item  *item)
+{
+       struct config_group *group = container_of(item, struct config_group,
+                                                 cg_item);
+       struct cma_dev_group *cma_dev_group = container_of(group,
+                                                          struct cma_dev_group,
+                                                          ports_group);
+
+       kfree(cma_dev_group->ports);
+       kfree(cma_dev_group->default_ports_group);
+       cma_dev_group->ports = NULL;
+       cma_dev_group->default_ports_group = NULL;
+}
+
+static struct configfs_item_operations cma_ports_item_ops = {
+       .release = release_cma_ports_group
+};
+
+static struct config_item_type cma_ports_group_type = {
+       .ct_item_ops    = &cma_ports_item_ops,
+       .ct_owner       = THIS_MODULE
+};
+
+static struct configfs_item_operations cma_device_item_ops = {
+       .release = release_cma_dev
+};
+
+static struct config_item_type cma_device_group_type = {
+       .ct_item_ops    = &cma_device_item_ops,
+       .ct_owner       = THIS_MODULE
+};
+
+static struct config_group *make_cma_dev(struct config_group *group,
+                                        const char *name)
+{
+       int err = -ENODEV;
+       struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+                                                              (void *)name);
+       struct cma_dev_group *cma_dev_group = NULL;
+
+       if (!cma_dev)
+               goto fail;
+
+       cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL);
+
+       if (!cma_dev_group) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
+
+       err = make_cma_ports(cma_dev_group, cma_dev);
+       if (err)
+               goto fail;
+
+       cma_dev_group->ports_group.default_groups =
+               cma_dev_group->default_ports_group;
+       config_group_init_type_name(&cma_dev_group->ports_group, "ports",
+                                   &cma_ports_group_type);
+
+       cma_dev_group->device_group.default_groups
+               = cma_dev_group->default_dev_group;
+       cma_dev_group->default_dev_group[0] = &cma_dev_group->ports_group;
+       cma_dev_group->default_dev_group[1] = NULL;
+
+       config_group_init_type_name(&cma_dev_group->device_group, name,
+                                   &cma_device_group_type);
+
+       cma_deref_dev(cma_dev);
+       return &cma_dev_group->device_group;
+
+fail:
+       if (cma_dev)
+               cma_deref_dev(cma_dev);
+       kfree(cma_dev_group);
+       return ERR_PTR(err);
+}
+
+static struct configfs_group_operations cma_subsys_group_ops = {
+       .make_group     = make_cma_dev,
+};
+
+static struct config_item_type cma_subsys_type = {
+       .ct_group_ops   = &cma_subsys_group_ops,
+       .ct_owner       = THIS_MODULE,
+};
+
+static struct configfs_subsystem cma_subsys = {
+       .su_group       = {
+               .cg_item        = {
+                       .ci_namebuf     = "rdma_cm",
+                       .ci_type        = &cma_subsys_type,
+               },
+       },
+};
+
+int __init cma_configfs_init(void)
+{
+       config_group_init(&cma_subsys.su_group);
+       mutex_init(&cma_subsys.su_mutex);
+       return configfs_register_subsystem(&cma_subsys);
+}
+
+void __exit cma_configfs_exit(void)
+{
+       configfs_unregister_subsystem(&cma_subsys);
+}
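[Editor's note: with the subsystem registered, each live RDMA device can be instantiated under the new configfs tree by name, and the per-port default RoCE mode becomes tunable from user space; the accepted strings come from ib_cache_gid_type_str() (see also the new Documentation/ABI/testing/configfs-rdma_cm in this merge). An illustrative session, assuming an HCA named mlx4_0:

    # mount -t configfs none /sys/kernel/config
    # mkdir /sys/kernel/config/rdma_cm/mlx4_0
    # cat /sys/kernel/config/rdma_cm/mlx4_0/ports/1/default_roce_mode
    IB/RoCE v1
    # echo "RoCE v2" > /sys/kernel/config/rdma_cm/mlx4_0/ports/1/default_roce_mode

The mkdir is what invokes make_cma_dev(): the directory name must match an existing ib_device, otherwise the lookup in cma_enum_devices_by_ibdev() fails and the mkdir returns an error.]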
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 5cf6eb716f000a7aa07233419a0dcbbce41e0b6d..eab32215756b935159355e22c8d9bf76f108f873 100644 (file)
 
 #include <rdma/ib_verbs.h>
 
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#else
+static inline int cma_configfs_init(void)
+{
+       return 0;
+}
+
+static inline void cma_configfs_exit(void)
+{
+}
+#endif
+struct cma_device;
+void cma_ref_dev(struct cma_device *cma_dev);
+void cma_deref_dev(struct cma_device *cma_dev);
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+                                            void               *cookie);
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+                            unsigned int port);
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+                            unsigned int port,
+                            enum ib_gid_type default_gid_type);
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev);
+
 int  ib_device_register_sysfs(struct ib_device *device,
                              int (*port_callback)(struct ib_device *,
                                                   u8, struct kobject *));
@@ -70,8 +96,13 @@ enum ib_cache_gid_default_mode {
        IB_CACHE_GID_DEFAULT_MODE_DELETE
 };
 
+int ib_cache_gid_parse_type_str(const char *buf);
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type);
+
 void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
                                  struct net_device *ndev,
+                                 unsigned long gid_type_mask,
                                  enum ib_cache_gid_default_mode mode);
 
 int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
@@ -87,9 +118,23 @@ int roce_gid_mgmt_init(void);
 void roce_gid_mgmt_cleanup(void);
 
 int roce_rescan_device(struct ib_device *ib_dev);
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port);
 
 int ib_cache_setup_one(struct ib_device *device);
 void ib_cache_cleanup_one(struct ib_device *device);
 void ib_cache_release_one(struct ib_device *device);
 
+static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
+                                        struct net_device *upper)
+{
+       struct net_device *_upper = NULL;
+       struct list_head *iter;
+
+       netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+               if (_upper == upper)
+                       break;
+
+       return _upper == upper;
+}
+
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644 (file)
index 0000000..a754fc7
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH                  16
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ             256
+#define IB_POLL_BUDGET_WORKQUEUE       65536
+
+#define IB_POLL_FLAGS \
+       (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static int __ib_process_cq(struct ib_cq *cq, int budget)
+{
+       int i, n, completed = 0;
+
+       while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
+               for (i = 0; i < n; i++) {
+                       struct ib_wc *wc = &cq->wc[i];
+
+                       if (wc->wr_cqe)
+                               wc->wr_cqe->done(cq, wc);
+                       else
+                               WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+               }
+
+               completed += n;
+
+               if (n != IB_POLL_BATCH ||
+                   (budget != -1 && completed >= budget))
+                       break;
+       }
+
+       return completed;
+}
+
+/**
+ * ib_process_cq_direct - process a CQ in caller context
+ * @cq:                CQ to process
+ * @budget:    number of CQEs to poll for
+ *
+ * This function is used to process all outstanding CQ entries on a
+ * %IB_POLL_DIRECT CQ.  It does not offload CQ processing to a different
+ * context and does not ask for completion interrupts from the HCA.
+ *
+ * Note: for compatibility reasons -1 can be passed in %budget for unlimited
+ * polling.  Do not use this feature in new code, it will be removed soon.
+ */
+int ib_process_cq_direct(struct ib_cq *cq, int budget)
+{
+       WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
+
+       return __ib_process_cq(cq, budget);
+}
+EXPORT_SYMBOL(ib_process_cq_direct);
+
+static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
+{
+       WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
+}
+
+static int ib_poll_handler(struct irq_poll *iop, int budget)
+{
+       struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+       int completed;
+
+       completed = __ib_process_cq(cq, budget);
+       if (completed < budget) {
+               irq_poll_complete(&cq->iop);
+               if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+                       irq_poll_sched(&cq->iop);
+       }
+
+       return completed;
+}
+
+static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
+{
+       irq_poll_sched(&cq->iop);
+}
+
+static void ib_cq_poll_work(struct work_struct *work)
+{
+       struct ib_cq *cq = container_of(work, struct ib_cq, work);
+       int completed;
+
+       completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
+       if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
+           ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+               queue_work(ib_comp_wq, &cq->work);
+}
+
+static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+       queue_work(ib_comp_wq, &cq->work);
+}
+
+/**
+ * ib_alloc_cq - allocate a completion queue
+ * @dev:               device to allocate the CQ for
+ * @private:           driver private data, accessible from cq->cq_context
+ * @nr_cqe:            number of CQEs to allocate
+ * @comp_vector:       HCA completion vectors for this CQ
+ * @poll_ctx:          context to poll the CQ from.
+ *
+ * This is the proper interface to allocate a CQ for in-kernel users. A
+ * CQ allocated with this interface will automatically be polled from the
+ * specified context.  The ULP must use wr->wr_cqe instead of wr->wr_id
+ * to use this CQ abstraction.
+ */
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+               int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
+{
+       struct ib_cq_init_attr cq_attr = {
+               .cqe            = nr_cqe,
+               .comp_vector    = comp_vector,
+       };
+       struct ib_cq *cq;
+       int ret = -ENOMEM;
+
+       cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
+       if (IS_ERR(cq))
+               return cq;
+
+       cq->device = dev;
+       cq->uobject = NULL;
+       cq->event_handler = NULL;
+       cq->cq_context = private;
+       cq->poll_ctx = poll_ctx;
+       atomic_set(&cq->usecnt, 0);
+
+       cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
+       if (!cq->wc)
+               goto out_destroy_cq;
+
+       switch (cq->poll_ctx) {
+       case IB_POLL_DIRECT:
+               cq->comp_handler = ib_cq_completion_direct;
+               break;
+       case IB_POLL_SOFTIRQ:
+               cq->comp_handler = ib_cq_completion_softirq;
+
+               irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+               ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+               break;
+       case IB_POLL_WORKQUEUE:
+               cq->comp_handler = ib_cq_completion_workqueue;
+               INIT_WORK(&cq->work, ib_cq_poll_work);
+               ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+               break;
+       default:
+               ret = -EINVAL;
+               goto out_free_wc;
+       }
+
+       return cq;
+
+out_free_wc:
+       kfree(cq->wc);
+out_destroy_cq:
+       cq->device->destroy_cq(cq);
+       return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_alloc_cq);
+
+/**
+ * ib_free_cq - free a completion queue
+ * @cq:                completion queue to free.
+ */
+void ib_free_cq(struct ib_cq *cq)
+{
+       int ret;
+
+       if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
+               return;
+
+       switch (cq->poll_ctx) {
+       case IB_POLL_DIRECT:
+               break;
+       case IB_POLL_SOFTIRQ:
+               irq_poll_disable(&cq->iop);
+               break;
+       case IB_POLL_WORKQUEUE:
+               flush_work(&cq->work);
+               break;
+       default:
+               WARN_ON_ONCE(1);
+       }
+
+       kfree(cq->wc);
+       ret = cq->device->destroy_cq(cq);
+       WARN_ON_ONCE(ret);
+}
+EXPORT_SYMBOL(ib_free_cq);
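[Editor's note: for ULP authors the new API replaces the ib_create_cq()/ib_poll_cq()/wr_id pattern: embed a struct ib_cqe next to the per-request state, point the work request's wr_cqe at it, and recover the request in the done callback via container_of(). A hypothetical consumer (all my_* names invented for illustration):

    struct my_request {
            struct ib_cqe cqe;
            /* ... per-request state ... */
    };

    static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
    {
            struct my_request *req =
                    container_of(wc->wr_cqe, struct my_request, cqe);

            if (wc->status != IB_WC_SUCCESS)
                    return; /* flush/error handling goes here */
            /* ... complete req ... */
    }

    static int my_post_send(struct ib_qp *qp, struct my_request *req,
                            struct ib_sge *sge)
    {
            struct ib_send_wr wr, *bad_wr;

            memset(&wr, 0, sizeof(wr));
            req->cqe.done = my_send_done;
            wr.wr_cqe  = &req->cqe; /* instead of wr.wr_id */
            wr.sg_list = sge;
            wr.num_sge = 1;
            wr.opcode  = IB_WR_SEND;
            return ib_post_send(qp, &wr, &bad_wr);
    }

    /* CQ setup: 128 CQEs on completion vector 0, polled from softirq */
    cq = ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_SOFTIRQ);

The mad.c conversion below follows exactly this pattern, with struct ib_mad_list_head as the container.]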
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 179e8134d57fc13b425254d5162006f9fc201879..00da80e02154205c428ca00702f4b7c10fa5a829 100644 (file)
@@ -58,6 +58,7 @@ struct ib_client_data {
        bool              going_down;
 };
 
+struct workqueue_struct *ib_comp_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
@@ -325,6 +326,7 @@ int ib_register_device(struct ib_device *device,
 {
        int ret;
        struct ib_client *client;
+       struct ib_udata uhw = {.outlen = 0, .inlen = 0};
 
        mutex_lock(&device_mutex);
 
@@ -352,6 +354,13 @@ int ib_register_device(struct ib_device *device,
                goto out;
        }
 
+       memset(&device->attrs, 0, sizeof(device->attrs));
+       ret = device->query_device(device, &device->attrs, &uhw);
+       if (ret) {
+               printk(KERN_WARNING "Couldn't query the device attributes\n");
+               goto out;
+       }
+
        ret = ib_device_register_sysfs(device, port_callback);
        if (ret) {
                printk(KERN_WARNING "Couldn't register device %s with driver model\n",
@@ -627,25 +636,6 @@ void ib_dispatch_event(struct ib_event *event)
 }
 EXPORT_SYMBOL(ib_dispatch_event);
 
-/**
- * ib_query_device - Query IB device attributes
- * @device:Device to query
- * @device_attr:Device attributes
- *
- * ib_query_device() returns the attributes of a device through the
- * @device_attr pointer.
- */
-int ib_query_device(struct ib_device *device,
-                   struct ib_device_attr *device_attr)
-{
-       struct ib_udata uhw = {.outlen = 0, .inlen = 0};
-
-       memset(device_attr, 0, sizeof(*device_attr));
-
-       return device->query_device(device, device_attr, &uhw);
-}
-EXPORT_SYMBOL(ib_query_device);
-
 /**
  * ib_query_port - Query IB port attributes
  * @device:Device to query
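[Editor's note: with the attributes cached at registration time, every former ib_query_device() call site collapses to a field read; a before/after sketch (max_sge chosen arbitrarily):

    /* before: each caller allocated and filled an ib_device_attr */
    struct ib_device_attr attr;
    int ret = ib_query_device(device, &attr);
    if (ret)
            return ret;
    max_sge = attr.max_sge;

    /* after: read the copy cached by ib_register_device() */
    max_sge = device->attrs.max_sge;

The fmr_pool.c hunk below is a typical example of the resulting simplification.]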
@@ -825,26 +815,31 @@ EXPORT_SYMBOL(ib_modify_port);
  *   a specified GID value occurs.
  * @device: The device to query.
  * @gid: The GID value to search for.
+ * @gid_type: Type of GID.
  * @ndev: The ndev related to the GID to search for.
  * @port_num: The port number of the device where the GID value was found.
  * @index: The index into the GID table where the GID was found.  This
  *   parameter may be NULL.
  */
 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
-               struct net_device *ndev, u8 *port_num, u16 *index)
+               enum ib_gid_type gid_type, struct net_device *ndev,
+               u8 *port_num, u16 *index)
 {
        union ib_gid tmp_gid;
        int ret, port, i;
 
        for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
                if (rdma_cap_roce_gid_table(device, port)) {
-                       if (!ib_find_cached_gid_by_port(device, gid, port,
+                       if (!ib_find_cached_gid_by_port(device, gid, gid_type, port,
                                                        ndev, index)) {
                                *port_num = port;
                                return 0;
                        }
                }
 
+               if (gid_type != IB_GID_TYPE_IB)
+                       continue;
+
                for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
                        ret = ib_query_gid(device, port, i, &tmp_gid, NULL);
                        if (ret)
@@ -954,10 +949,18 @@ static int __init ib_core_init(void)
        if (!ib_wq)
                return -ENOMEM;
 
+       ib_comp_wq = alloc_workqueue("ib-comp-wq",
+                       WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+                       WQ_UNBOUND_MAX_ACTIVE);
+       if (!ib_comp_wq) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
        ret = class_register(&ib_class);
        if (ret) {
                printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
-               goto err;
+               goto err_comp;
        }
 
        ret = ibnl_init();
@@ -972,7 +975,8 @@ static int __init ib_core_init(void)
 
 err_sysfs:
        class_unregister(&ib_class);
-
+err_comp:
+       destroy_workqueue(ib_comp_wq);
 err:
        destroy_workqueue(ib_wq);
        return ret;
@@ -983,6 +987,7 @@ static void __exit ib_core_cleanup(void)
        ib_cache_cleanup();
        ibnl_cleanup();
        class_unregister(&ib_class);
+       destroy_workqueue(ib_comp_wq);
        /* Make sure that any pending umem accounting work is done. */
        destroy_workqueue(ib_wq);
 }
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
index 9f5ad7cc33c89985fb1c490c27f07766e38e29fa..6ac3683c144ba859ff34709cc292e2fc3208f621 100644 (file)
@@ -212,7 +212,6 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
 {
        struct ib_device   *device;
        struct ib_fmr_pool *pool;
-       struct ib_device_attr *attr;
        int i;
        int ret;
        int max_remaps;
@@ -228,25 +227,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
                return ERR_PTR(-ENOSYS);
        }
 
-       attr = kmalloc(sizeof *attr, GFP_KERNEL);
-       if (!attr) {
-               printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
-               return ERR_PTR(-ENOMEM);
-       }
-
-       ret = ib_query_device(device, attr);
-       if (ret) {
-               printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
-               kfree(attr);
-               return ERR_PTR(ret);
-       }
-
-       if (!attr->max_map_per_fmr)
+       if (!device->attrs.max_map_per_fmr)
                max_remaps = IB_FMR_MAX_REMAPS;
        else
-               max_remaps = attr->max_map_per_fmr;
-
-       kfree(attr);
+               max_remaps = device->attrs.max_map_per_fmr;
 
        pool = kmalloc(sizeof *pool, GFP_KERNEL);
        if (!pool) {
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 2281de122038e45a5c375f7d7669111ab2aee2de..9fa5bf33f5a34261b16a72c9800b55daab921c84 100644 (file)
@@ -84,6 +84,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                              u8 mgmt_class);
 static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
                           struct ib_mad_agent_private *agent_priv);
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+                             struct ib_wc *wc);
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc);
 
 /*
  * Returns a ib_mad_port_private structure or NULL for a device/port
@@ -681,7 +684,7 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info,
 
                atomic_inc(&mad_snoop_priv->refcount);
                spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
-               mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+               mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL,
                                                   mad_recv_wc);
                deref_snoop_agent(mad_snoop_priv);
                spin_lock_irqsave(&qp_info->snoop_lock, flags);
@@ -689,12 +692,11 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info,
        spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
 }
 
-static void build_smp_wc(struct ib_qp *qp,
-                        u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
-                        struct ib_wc *wc)
+static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid,
+               u16 pkey_index, u8 port_num, struct ib_wc *wc)
 {
        memset(wc, 0, sizeof *wc);
-       wc->wr_id = wr_id;
+       wc->wr_cqe = cqe;
        wc->status = IB_WC_SUCCESS;
        wc->opcode = IB_WC_RECV;
        wc->pkey_index = pkey_index;
@@ -832,7 +834,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
        }
 
        build_smp_wc(mad_agent_priv->agent.qp,
-                    send_wr->wr.wr_id, drslid,
+                    send_wr->wr.wr_cqe, drslid,
                     send_wr->pkey_index,
                     send_wr->port_num, &mad_wc);
 
@@ -1039,7 +1041,9 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 
        mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
 
-       mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr;
+       mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+
+       mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
        mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list;
        mad_send_wr->send_wr.wr.num_sge = 2;
        mad_send_wr->send_wr.wr.opcode = IB_WR_SEND;
@@ -1151,8 +1155,9 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
 
        /* Set WR ID to find mad_send_wr upon completion */
        qp_info = mad_send_wr->mad_agent_priv->qp_info;
-       mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
        mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+       mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+       mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
 
        mad_agent = mad_send_wr->send_buf.mad_agent;
        sge = mad_send_wr->sg_list;
@@ -1982,9 +1987,9 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
                                /* user rmpp is in effect
                                 * and this is an active RMPP MAD
                                 */
-                               mad_recv_wc->wc->wr_id = 0;
-                               mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
-                                                                  mad_recv_wc);
+                               mad_agent_priv->agent.recv_handler(
+                                               &mad_agent_priv->agent, NULL,
+                                               mad_recv_wc);
                                atomic_dec(&mad_agent_priv->refcount);
                        } else {
                                /* not user rmpp, revert to normal behavior and
@@ -1998,9 +2003,10 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
                        spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
                        /* Defined behavior is to complete response before request */
-                       mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
-                       mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
-                                                          mad_recv_wc);
+                       mad_agent_priv->agent.recv_handler(
+                                       &mad_agent_priv->agent,
+                                       &mad_send_wr->send_buf,
+                                       mad_recv_wc);
                        atomic_dec(&mad_agent_priv->refcount);
 
                        mad_send_wc.status = IB_WC_SUCCESS;
@@ -2009,7 +2015,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
                        ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
                }
        } else {
-               mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+               mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
                                                   mad_recv_wc);
                deref_mad_agent(mad_agent_priv);
        }
@@ -2172,13 +2178,14 @@ handle_smi(struct ib_mad_port_private *port_priv,
        return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response);
 }
 
-static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
-                                    struct ib_wc *wc)
+static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+       struct ib_mad_port_private *port_priv = cq->cq_context;
+       struct ib_mad_list_head *mad_list =
+               container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
        struct ib_mad_qp_info *qp_info;
        struct ib_mad_private_header *mad_priv_hdr;
        struct ib_mad_private *recv, *response = NULL;
-       struct ib_mad_list_head *mad_list;
        struct ib_mad_agent_private *mad_agent;
        int port_num;
        int ret = IB_MAD_RESULT_SUCCESS;
@@ -2186,7 +2193,17 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
        u16 resp_mad_pkey_index = 0;
        bool opa;
 
-       mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+       if (list_empty_careful(&port_priv->port_list))
+               return;
+
+       if (wc->status != IB_WC_SUCCESS) {
+               /*
+                * Receive errors indicate that the QP has entered the error
+                * state - error handling/shutdown code will cleanup
+                */
+               return;
+       }
+
        qp_info = mad_list->mad_queue->qp_info;
        dequeue_mad(mad_list);
 
@@ -2227,7 +2244,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
        response = alloc_mad_private(mad_size, GFP_KERNEL);
        if (!response) {
                dev_err(&port_priv->device->dev,
-                       "ib_mad_recv_done_handler no memory for response buffer\n");
+                       "%s: no memory for response buffer\n", __func__);
                goto out;
        }
 
@@ -2413,11 +2430,12 @@ done:
        spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 }
 
-static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
-                                    struct ib_wc *wc)
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+       struct ib_mad_port_private *port_priv = cq->cq_context;
+       struct ib_mad_list_head *mad_list =
+               container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
        struct ib_mad_send_wr_private   *mad_send_wr, *queued_send_wr;
-       struct ib_mad_list_head         *mad_list;
        struct ib_mad_qp_info           *qp_info;
        struct ib_mad_queue             *send_queue;
        struct ib_send_wr               *bad_send_wr;
@@ -2425,7 +2443,14 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
        unsigned long flags;
        int ret;
 
-       mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+       if (list_empty_careful(&port_priv->port_list))
+               return;
+
+       if (wc->status != IB_WC_SUCCESS) {
+               if (!ib_mad_send_error(port_priv, wc))
+                       return;
+       }
+
        mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
                                   mad_list);
        send_queue = mad_list->mad_queue;
@@ -2490,24 +2515,15 @@ static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
        spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
 }
 
-static void mad_error_handler(struct ib_mad_port_private *port_priv,
-                             struct ib_wc *wc)
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+               struct ib_wc *wc)
 {
-       struct ib_mad_list_head *mad_list;
-       struct ib_mad_qp_info *qp_info;
+       struct ib_mad_list_head *mad_list =
+               container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+       struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info;
        struct ib_mad_send_wr_private *mad_send_wr;
        int ret;
 
-       /* Determine if failure was a send or receive */
-       mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
-       qp_info = mad_list->mad_queue->qp_info;
-       if (mad_list->mad_queue == &qp_info->recv_queue)
-               /*
-                * Receive errors indicate that the QP has entered the error
-                * state - error handling/shutdown code will cleanup
-                */
-               return;
-
        /*
         * Send errors will transition the QP to SQE - move
         * QP to RTS and repost flushed work requests
@@ -2522,10 +2538,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
                        mad_send_wr->retry = 0;
                        ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
                                        &bad_send_wr);
-                       if (ret)
-                               ib_mad_send_done_handler(port_priv, wc);
-               } else
-                       ib_mad_send_done_handler(port_priv, wc);
+                       if (!ret)
+                               return false;
+               }
        } else {
                struct ib_qp_attr *attr;
 
@@ -2539,42 +2554,14 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
                        kfree(attr);
                        if (ret)
                                dev_err(&port_priv->device->dev,
-                                       "mad_error_handler - ib_modify_qp to RTS : %d\n",
-                                       ret);
+                                       "%s - ib_modify_qp to RTS: %d\n",
+                                       __func__, ret);
                        else
                                mark_sends_for_retry(qp_info);
                }
-               ib_mad_send_done_handler(port_priv, wc);
        }
-}
 
-/*
- * IB MAD completion callback
- */
-static void ib_mad_completion_handler(struct work_struct *work)
-{
-       struct ib_mad_port_private *port_priv;
-       struct ib_wc wc;
-
-       port_priv = container_of(work, struct ib_mad_port_private, work);
-       ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
-
-       while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
-               if (wc.status == IB_WC_SUCCESS) {
-                       switch (wc.opcode) {
-                       case IB_WC_SEND:
-                               ib_mad_send_done_handler(port_priv, &wc);
-                               break;
-                       case IB_WC_RECV:
-                               ib_mad_recv_done_handler(port_priv, &wc);
-                               break;
-                       default:
-                               BUG_ON(1);
-                               break;
-                       }
-               } else
-                       mad_error_handler(port_priv, &wc);
-       }
+       return true;
 }
 
 static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
@@ -2716,7 +2703,7 @@ static void local_completions(struct work_struct *work)
                         * before request
                         */
                        build_smp_wc(recv_mad_agent->agent.qp,
-                                    (unsigned long) local->mad_send_wr,
+                                    local->mad_send_wr->send_wr.wr.wr_cqe,
                                     be16_to_cpu(IB_LID_PERMISSIVE),
                                     local->mad_send_wr->send_wr.pkey_index,
                                     recv_mad_agent->agent.port_num, &wc);
@@ -2744,6 +2731,7 @@ static void local_completions(struct work_struct *work)
                                           IB_MAD_SNOOP_RECVS);
                        recv_mad_agent->agent.recv_handler(
                                                &recv_mad_agent->agent,
+                                               &local->mad_send_wr->send_buf,
                                                &local->mad_priv->header.recv_wc);
                        spin_lock_irqsave(&recv_mad_agent->lock, flags);
                        atomic_dec(&recv_mad_agent->refcount);
@@ -2855,17 +2843,6 @@ static void timeout_sends(struct work_struct *work)
        spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 }
 
-static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
-{
-       struct ib_mad_port_private *port_priv = cq->cq_context;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ib_mad_port_list_lock, flags);
-       if (!list_empty(&port_priv->port_list))
-               queue_work(port_priv->wq, &port_priv->work);
-       spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
-}
-
 /*
  * Allocate receive MADs and post receive WRs for them
  */
@@ -2913,8 +2890,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
                        break;
                }
                mad_priv->header.mapping = sg_list.addr;
-               recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
                mad_priv->header.mad_list.mad_queue = recv_queue;
+               mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
+               recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
 
                /* Post receive WR */
                spin_lock_irqsave(&recv_queue->lock, flags);
@@ -3151,7 +3129,6 @@ static int ib_mad_port_open(struct ib_device *device,
        unsigned long flags;
        char name[sizeof "ib_mad123"];
        int has_smi;
-       struct ib_cq_init_attr cq_attr = {};
 
        if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
                return -EFAULT;
@@ -3179,10 +3156,8 @@ static int ib_mad_port_open(struct ib_device *device,
        if (has_smi)
                cq_size *= 2;
 
-       cq_attr.cqe = cq_size;
-       port_priv->cq = ib_create_cq(port_priv->device,
-                                    ib_mad_thread_completion_handler,
-                                    NULL, port_priv, &cq_attr);
+       port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
+                       IB_POLL_WORKQUEUE);
        if (IS_ERR(port_priv->cq)) {
                dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
                ret = PTR_ERR(port_priv->cq);
@@ -3211,7 +3186,6 @@ static int ib_mad_port_open(struct ib_device *device,
                ret = -ENOMEM;
                goto error8;
        }
-       INIT_WORK(&port_priv->work, ib_mad_completion_handler);
 
        spin_lock_irqsave(&ib_mad_port_list_lock, flags);
        list_add_tail(&port_priv->port_list, &ib_mad_port_list);
@@ -3238,7 +3212,7 @@ error7:
 error6:
        ib_dealloc_pd(port_priv->pd);
 error4:
-       ib_destroy_cq(port_priv->cq);
+       ib_free_cq(port_priv->cq);
        cleanup_recv_queue(&port_priv->qp_info[1]);
        cleanup_recv_queue(&port_priv->qp_info[0]);
 error3:
@@ -3271,7 +3245,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
        destroy_mad_qp(&port_priv->qp_info[1]);
        destroy_mad_qp(&port_priv->qp_info[0]);
        ib_dealloc_pd(port_priv->pd);
-       ib_destroy_cq(port_priv->cq);
+       ib_free_cq(port_priv->cq);
        cleanup_recv_queue(&port_priv->qp_info[1]);
        cleanup_recv_queue(&port_priv->qp_info[0]);
        /* XXX: Handle deallocation of MAD registration tables */
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 990698a6ab4b7024116ae50c9417d00ce179b41c..28669f6419e1fba02b65a0d776c55b08d3cc4fbc 100644 (file)
@@ -64,6 +64,7 @@
 
 struct ib_mad_list_head {
        struct list_head list;
+       struct ib_cqe cqe;
        struct ib_mad_queue *mad_queue;
 };
 
@@ -204,7 +205,6 @@ struct ib_mad_port_private {
        struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
        struct list_head agent_list;
        struct workqueue_struct *wq;
-       struct work_struct work;
        struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
 };
 
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index bb6685fb08c61483546505f99037b88af6202ad0..250937cb9a1a5071f054a24e74414a8c90bd88a0 100644 (file)
@@ -723,14 +723,27 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
 
 int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
                             struct ib_sa_mcmember_rec *rec,
+                            struct net_device *ndev,
+                            enum ib_gid_type gid_type,
                             struct ib_ah_attr *ah_attr)
 {
        int ret;
        u16 gid_index;
        u8 p;
 
-       ret = ib_find_cached_gid(device, &rec->port_gid,
-                                NULL, &p, &gid_index);
+       if (rdma_protocol_roce(device, port_num)) {
+               ret = ib_find_cached_gid_by_port(device, &rec->port_gid,
+                                                gid_type, port_num,
+                                                ndev,
+                                                &gid_index);
+       } else if (rdma_protocol_ib(device, port_num)) {
+               ret = ib_find_cached_gid(device, &rec->port_gid,
+                                        IB_GID_TYPE_IB, NULL, &p,
+                                        &gid_index);
+       } else {
+               ret = -EINVAL;
+       }
+
        if (ret)
                return ret;
 
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 178f98482e13e217c16d447978a855a900808bc0..06556c34606deb38e8906d08327a7d024f38079e 100644 (file)
@@ -67,17 +67,53 @@ struct netdev_event_work {
        struct netdev_event_work_cmd    cmds[ROCE_NETDEV_CALLBACK_SZ];
 };
 
+static const struct {
+       bool (*is_supported)(const struct ib_device *device, u8 port_num);
+       enum ib_gid_type gid_type;
+} PORT_CAP_TO_GID_TYPE[] = {
+       {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
+       {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP},
+};
+
+#define CAP_TO_GID_TABLE_SIZE  ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
+
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port)
+{
+       int i;
+       unsigned int ret_flags = 0;
+
+       if (!rdma_protocol_roce(ib_dev, port))
+               return 1UL << IB_GID_TYPE_IB;
+
+       for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
+               if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port))
+                       ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;
+
+       return ret_flags;
+}
+EXPORT_SYMBOL(roce_gid_type_mask_support);
+
 static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
                       u8 port, union ib_gid *gid,
                       struct ib_gid_attr *gid_attr)
 {
-       switch (gid_op) {
-       case GID_ADD:
-               ib_cache_gid_add(ib_dev, port, gid, gid_attr);
-               break;
-       case GID_DEL:
-               ib_cache_gid_del(ib_dev, port, gid, gid_attr);
-               break;
+       int i;
+       unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+       for (i = 0; i < IB_GID_TYPE_SIZE; i++) {
+               if ((1UL << i) & gid_type_mask) {
+                       gid_attr->gid_type = i;
+                       switch (gid_op) {
+                       case GID_ADD:
+                               ib_cache_gid_add(ib_dev, port,
+                                                gid, gid_attr);
+                               break;
+                       case GID_DEL:
+                               ib_cache_gid_del(ib_dev, port,
+                                                gid, gid_attr);
+                               break;
+                       }
+               }
        }
 }
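[Editor's note: the mask packs one bit per enum ib_gid_type value, which is why update_gid() above can simply iterate over IB_GID_TYPE_SIZE bits. Callers test support with a shift; illustrative only:

    unsigned long mask = roce_gid_type_mask_support(ib_dev, port);

    if (mask & (1UL << IB_GID_TYPE_ROCE_UDP_ENCAP))
            pr_debug("port %u supports RoCE v2 GIDs\n", port);
    if (mask & (1UL << IB_GID_TYPE_ROCE))
            pr_debug("port %u supports RoCE v1 GIDs\n", port);

Pure IB ports report only the IB_GID_TYPE_IB bit, so the rest of the GID cache code can treat IB and RoCE ports uniformly.]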
 
@@ -103,18 +139,6 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de
        return BONDING_SLAVE_STATE_NA;
 }
 
-static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
-{
-       struct net_device *_upper = NULL;
-       struct list_head *iter;
-
-       netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
-               if (_upper == upper)
-                       break;
-
-       return _upper == upper;
-}
-
 #define REQUIRED_BOND_STATES           (BONDING_SLAVE_STATE_ACTIVE |   \
                                         BONDING_SLAVE_STATE_NA)
 static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
@@ -132,7 +156,7 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
        if (!real_dev)
                real_dev = event_ndev;
 
-       res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+       res = ((rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) &&
               (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
                REQUIRED_BOND_STATES)) ||
               real_dev == rdma_ndev);
@@ -178,7 +202,7 @@ static int upper_device_filter(struct ib_device *ib_dev, u8 port,
                return 1;
 
        rcu_read_lock();
-       res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+       res = rdma_is_upper_dev_rcu(rdma_ndev, event_ndev);
        rcu_read_unlock();
 
        return res;
@@ -203,10 +227,12 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev,
                                     u8 port, struct net_device *event_ndev,
                                     struct net_device *rdma_ndev)
 {
+       unsigned long gid_type_mask;
+
        rcu_read_lock();
        if (!rdma_ndev ||
            ((rdma_ndev != event_ndev &&
-             !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+             !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
             is_eth_active_slave_of_bonding_rcu(rdma_ndev,
                                                netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
             BONDING_SLAVE_STATE_INACTIVE)) {
@@ -215,7 +241,9 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev,
        }
        rcu_read_unlock();
 
-       ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+       gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+       ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask,
                                     IB_CACHE_GID_DEFAULT_MODE_SET);
 }
 
@@ -234,12 +262,17 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
 
        rcu_read_lock();
 
-       if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+       if (rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) &&
            is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
            BONDING_SLAVE_STATE_INACTIVE) {
+               unsigned long gid_type_mask;
+
                rcu_read_unlock();
 
+               gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
                ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+                                            gid_type_mask,
                                             IB_CACHE_GID_DEFAULT_MODE_DELETE);
        } else {
                rcu_read_unlock();
index a95a32ba596edc03728cf7790a6752c951cde951..f334090bb6129bf7f3cfe788e5cff7bc762752c3 100644 (file)
@@ -49,7 +49,9 @@
 #include <net/netlink.h>
 #include <uapi/rdma/ib_user_sa.h>
 #include <rdma/ib_marshall.h>
+#include <rdma/ib_addr.h>
 #include "sa.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
@@ -715,7 +717,9 @@ static int ib_nl_handle_set_timeout(struct sk_buff *skb,
        struct nlattr *tb[LS_NLA_TYPE_MAX];
        int ret;
 
-       if (!netlink_capable(skb, CAP_NET_ADMIN))
+       if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+           !(NETLINK_CB(skb).sk) ||
+           !netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;
 
        ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
@@ -789,7 +793,9 @@ static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
        int found = 0;
        int ret;
 
-       if (!netlink_capable(skb, CAP_NET_ADMIN))
+       if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+           !(NETLINK_CB(skb).sk) ||
+           !netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;
 
        spin_lock_irqsave(&ib_nl_request_lock, flags);
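Note on the direction checks: the request handler insists on NLM_F_REQUEST while the response handler rejects it, so a message can never be replayed against the wrong handler. A hedged userspace-side sketch of a well-formed request header (macros from the uapi netlink headers; payload_len is caller-supplied):

        struct nlmsghdr nlh = {
                .nlmsg_len   = NLMSG_LENGTH(payload_len),
                .nlmsg_type  = RDMA_NL_GET_TYPE(RDMA_NL_LS,
                                                RDMA_NL_LS_OP_SET_TIMEOUT),
                .nlmsg_flags = NLM_F_REQUEST,   /* required on requests,
                                                   forbidden on responses */
        };
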
@@ -996,7 +1002,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 {
        int ret;
        u16 gid_index;
-       int force_grh;
+       int use_roce;
+       struct net_device *ndev = NULL;
 
        memset(ah_attr, 0, sizeof *ah_attr);
        ah_attr->dlid = be16_to_cpu(rec->dlid);
@@ -1006,16 +1013,71 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
        ah_attr->port_num = port_num;
        ah_attr->static_rate = rec->rate;
 
-       force_grh = rdma_cap_eth_ah(device, port_num);
+       use_roce = rdma_cap_eth_ah(device, port_num);
+
+       if (use_roce) {
+               struct net_device *idev;
+               struct net_device *resolved_dev;
+               struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex,
+                                                .net = rec->net ? rec->net :
+                                                        &init_net};
+               union {
+                       struct sockaddr     _sockaddr;
+                       struct sockaddr_in  _sockaddr_in;
+                       struct sockaddr_in6 _sockaddr_in6;
+               } sgid_addr, dgid_addr;
+
+               if (!device->get_netdev)
+                       return -EOPNOTSUPP;
+
+               rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid);
+               rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid);
+
+               /* validate the route */
+               ret = rdma_resolve_ip_route(&sgid_addr._sockaddr,
+                                           &dgid_addr._sockaddr, &dev_addr);
+               if (ret)
+                       return ret;
 
-       if (rec->hop_limit > 1 || force_grh) {
-               struct net_device *ndev = ib_get_ndev_from_path(rec);
+               if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+                    dev_addr.network == RDMA_NETWORK_IPV6) &&
+                   rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
+                       return -EINVAL;
+
+               idev = device->get_netdev(device, port_num);
+               if (!idev)
+                       return -ENODEV;
+
+               resolved_dev = dev_get_by_index(dev_addr.net,
+                                               dev_addr.bound_dev_if);
+               if (!resolved_dev) {
+                       dev_put(idev);
+                       return -ENODEV;
+               }
+               if (resolved_dev->flags & IFF_LOOPBACK) {
+                       dev_put(resolved_dev);
+                       resolved_dev = idev;
+                       dev_hold(resolved_dev);
+               }
+               ndev = ib_get_ndev_from_path(rec);
+               rcu_read_lock();
+               if ((ndev && ndev != resolved_dev) ||
+                   (resolved_dev != idev &&
+                    !rdma_is_upper_dev_rcu(idev, resolved_dev)))
+                       ret = -EHOSTUNREACH;
+               rcu_read_unlock();
+               dev_put(idev);
+               dev_put(resolved_dev);
+               if (ret) {
+                       if (ndev)
+                               dev_put(ndev);
+                       return ret;
+               }
+       }
 
+       if (rec->hop_limit > 1 || use_roce) {
                ah_attr->ah_flags = IB_AH_GRH;
                ah_attr->grh.dgid = rec->dgid;
 
-               ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num,
-                                        &gid_index);
+               ret = ib_find_cached_gid_by_port(device, &rec->sgid,
+                                                rec->gid_type, port_num, ndev,
+                                                &gid_index);
                if (ret) {
                        if (ndev)
                                dev_put(ndev);
@@ -1029,9 +1091,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
                if (ndev)
                        dev_put(ndev);
        }
-       if (force_grh) {
+
+       if (use_roce)
                memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
-       }
+
        return 0;
 }
 EXPORT_SYMBOL(ib_init_ah_from_path);
@@ -1157,6 +1220,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
                          mad->data, &rec);
                rec.net = NULL;
                rec.ifindex = 0;
+               rec.gid_type = IB_GID_TYPE_IB;
                memset(rec.dmac, 0, ETH_ALEN);
                query->callback(status, &rec, query->context);
        } else
@@ -1609,14 +1673,15 @@ static void send_handler(struct ib_mad_agent *agent,
 }
 
 static void recv_handler(struct ib_mad_agent *mad_agent,
+                        struct ib_mad_send_buf *send_buf,
                         struct ib_mad_recv_wc *mad_recv_wc)
 {
        struct ib_sa_query *query;
-       struct ib_mad_send_buf *mad_buf;
 
-       mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id;
-       query = mad_buf->context[0];
+       if (!send_buf)
+               return;
 
+       query = send_buf->context[0];
        if (query->callback) {
                if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
                        query->callback(query,
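The recv_handler() signature change threads the originating ib_mad_send_buf through from the MAD layer instead of recovering it from wc->wr_id, so handlers must treat a NULL send_buf as an unsolicited MAD. A minimal sketch of a handler under the new signature (my_recv_handler is illustrative):

        static void my_recv_handler(struct ib_mad_agent *agent,
                                    struct ib_mad_send_buf *send_buf,
                                    struct ib_mad_recv_wc *mad_recv_wc)
        {
                if (send_buf)   /* NULL means unsolicited: nothing to match */
                        pr_debug("response for request %p\n",
                                 send_buf->context[0]);
                ib_free_recv_mad(mad_recv_wc);
        }
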
index b1f37d4095fa1e15f7402c35a367b00e1f24b6b5..3de93517efe43b097d7f58e37cfbe36bd8ab0f65 100644 (file)
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/string.h>
+#include <linux/netdevice.h>
 
 #include <rdma/ib_mad.h>
+#include <rdma/ib_pma.h>
 
+struct ib_port;
+
+struct gid_attr_group {
+       struct ib_port          *port;
+       struct kobject          kobj;
+       struct attribute_group  ndev;
+       struct attribute_group  type;
+};
 struct ib_port {
        struct kobject         kobj;
        struct ib_device      *ibdev;
+       struct gid_attr_group *gid_attr_group;
        struct attribute_group gid_group;
        struct attribute_group pkey_group;
        u8                     port_num;
+       struct attribute_group *pma_table;
 };
 
 struct port_attribute {
@@ -65,6 +77,7 @@ struct port_table_attribute {
        struct port_attribute   attr;
        char                    name[8];
        int                     index;
+       __be16                  attr_id;
 };
 
 static ssize_t port_attr_show(struct kobject *kobj,
@@ -84,6 +97,24 @@ static const struct sysfs_ops port_sysfs_ops = {
        .show = port_attr_show
 };
 
+static ssize_t gid_attr_show(struct kobject *kobj,
+                            struct attribute *attr, char *buf)
+{
+       struct port_attribute *port_attr =
+               container_of(attr, struct port_attribute, attr);
+       struct ib_port *p = container_of(kobj, struct gid_attr_group,
+                                        kobj)->port;
+
+       if (!port_attr->show)
+               return -EIO;
+
+       return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops gid_attr_sysfs_ops = {
+       .show = gid_attr_show
+};
+
 static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
                          char *buf)
 {
@@ -281,6 +312,46 @@ static struct attribute *port_default_attrs[] = {
        NULL
 };
 
+static ssize_t print_ndev(struct ib_gid_attr *gid_attr, char *buf)
+{
+       if (!gid_attr->ndev)
+               return -EINVAL;
+
+       return sprintf(buf, "%s\n", gid_attr->ndev->name);
+}
+
+static ssize_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf)
+{
+       return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type));
+}
+
+static ssize_t _show_port_gid_attr(struct ib_port *p,
+                                  struct port_attribute *attr,
+                                  char *buf,
+                                  ssize_t (*print)(struct ib_gid_attr *gid_attr,
+                                                   char *buf))
+{
+       struct port_table_attribute *tab_attr =
+               container_of(attr, struct port_table_attribute, attr);
+       union ib_gid gid;
+       struct ib_gid_attr gid_attr = {};
+       ssize_t ret;
+
+       ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid,
+                          &gid_attr);
+       if (ret)
+               goto err;
+
+       ret = print(&gid_attr, buf);
+
+err:
+       if (gid_attr.ndev)
+               dev_put(gid_attr.ndev);
+       return ret;
+}
+
 static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
                             char *buf)
 {
@@ -296,6 +367,19 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
        return sprintf(buf, "%pI6\n", gid.raw);
 }
 
+static ssize_t show_port_gid_attr_ndev(struct ib_port *p,
+                                      struct port_attribute *attr, char *buf)
+{
+       return _show_port_gid_attr(p, attr, buf, print_ndev);
+}
+
+static ssize_t show_port_gid_attr_gid_type(struct ib_port *p,
+                                          struct port_attribute *attr,
+                                          char *buf)
+{
+       return _show_port_gid_attr(p, attr, buf, print_gid_type);
+}
+
 static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
                              char *buf)
 {
@@ -314,24 +398,32 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
 #define PORT_PMA_ATTR(_name, _counter, _width, _offset)                        \
 struct port_table_attribute port_pma_attr_##_name = {                  \
        .attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),        \
-       .index = (_offset) | ((_width) << 16) | ((_counter) << 24)      \
+       .index = (_offset) | ((_width) << 16) | ((_counter) << 24),     \
+       .attr_id = IB_PMA_PORT_COUNTERS,                                \
 }
 
-static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
-                               char *buf)
+#define PORT_PMA_ATTR_EXT(_name, _width, _offset)                      \
+struct port_table_attribute port_pma_attr_ext_##_name = {              \
+       .attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),        \
+       .index = (_offset) | ((_width) << 16),                          \
+       .attr_id = IB_PMA_PORT_COUNTERS_EXT,                            \
+}
+
+/*
+ * Get a Perfmgmt MAD block of data.
+ * Returns error code or the number of bytes retrieved.
+ */
+static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
+               void *data, int offset, size_t size)
 {
-       struct port_table_attribute *tab_attr =
-               container_of(attr, struct port_table_attribute, attr);
-       int offset = tab_attr->index & 0xffff;
-       int width  = (tab_attr->index >> 16) & 0xff;
-       struct ib_mad *in_mad  = NULL;
-       struct ib_mad *out_mad = NULL;
+       struct ib_mad *in_mad;
+       struct ib_mad *out_mad;
        size_t mad_size = sizeof(*out_mad);
        u16 out_mad_pkey_index = 0;
        ssize_t ret;
 
-       if (!p->ibdev->process_mad)
-               return sprintf(buf, "N/A (no PMA)\n");
+       if (!dev->process_mad)
+               return -ENOSYS;
 
        in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
        out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
@@ -344,12 +436,13 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
        in_mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_PERF_MGMT;
        in_mad->mad_hdr.class_version = 1;
        in_mad->mad_hdr.method        = IB_MGMT_METHOD_GET;
-       in_mad->mad_hdr.attr_id       = cpu_to_be16(0x12); /* PortCounters */
+       in_mad->mad_hdr.attr_id       = attr;
 
-       in_mad->data[41] = p->port_num; /* PortSelect field */
+       if (attr != IB_PMA_CLASS_PORT_INFO)
+               in_mad->data[41] = port_num;    /* PortSelect field */
 
-       if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
-                p->port_num, NULL, NULL,
+       if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY,
+                port_num, NULL, NULL,
                 (const struct ib_mad_hdr *)in_mad, mad_size,
                 (struct ib_mad_hdr *)out_mad, &mad_size,
                 &out_mad_pkey_index) &
@@ -358,31 +451,54 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
                ret = -EINVAL;
                goto out;
        }
+       memcpy(data, out_mad->data + offset, size);
+       ret = size;
+out:
+       kfree(in_mad);
+       kfree(out_mad);
+       return ret;
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+                               char *buf)
+{
+       struct port_table_attribute *tab_attr =
+               container_of(attr, struct port_table_attribute, attr);
+       int offset = tab_attr->index & 0xffff;
+       int width  = (tab_attr->index >> 16) & 0xff;
+       ssize_t ret;
+       u8 data[8];
+
+       ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data,
+                       40 + offset / 8, sizeof(data));
+       if (ret < 0)
+               return sprintf(buf, "N/A (no PMA)\n");
 
        switch (width) {
        case 4:
-               ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+               ret = sprintf(buf, "%u\n", (*data >>
                                            (4 - (offset % 8))) & 0xf);
                break;
        case 8:
-               ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+               ret = sprintf(buf, "%u\n", *data);
                break;
        case 16:
                ret = sprintf(buf, "%u\n",
-                             be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8)));
+                             be16_to_cpup((__be16 *)data));
                break;
        case 32:
                ret = sprintf(buf, "%u\n",
-                             be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
+                             be32_to_cpup((__be32 *)data));
+               break;
+       case 64:
+               ret = sprintf(buf, "%llu\n",
+                               be64_to_cpup((__be64 *)data));
                break;
        default:
                ret = 0;
        }
 
-out:
-       kfree(in_mad);
-       kfree(out_mad);
-
        return ret;
 }
 
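With get_perf_mad() factored out, show_pma_counter() copies at most eight bytes into a stack buffer and decodes by width; the new 64-bit arm serves the extended counters added below. A hedged userspace sketch of reading one such counter through the sysfs layout this file exposes (device and counter names are illustrative):

        #include <stdio.h>

        static long long read_counter(const char *hca, int port,
                                      const char *name)
        {
                char path[256];
                long long val;
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/class/infiniband/%s/ports/%d/counters/%s",
                         hca, port, name);
                f = fopen(path, "r");
                if (!f || fscanf(f, "%lld", &val) != 1)
                        val = -1;
                if (f)
                        fclose(f);
                return val;
        }

        /* e.g. read_counter("mlx4_0", 1, "port_xmit_data") */
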
@@ -403,6 +519,18 @@ static PORT_PMA_ATTR(port_rcv_data             , 13, 32, 224);
 static PORT_PMA_ATTR(port_xmit_packets             , 14, 32, 256);
 static PORT_PMA_ATTR(port_rcv_packets              , 15, 32, 288);
 
+/*
+ * Counters added by extended set
+ */
+static PORT_PMA_ATTR_EXT(port_xmit_data                    , 64,  64);
+static PORT_PMA_ATTR_EXT(port_rcv_data             , 64, 128);
+static PORT_PMA_ATTR_EXT(port_xmit_packets         , 64, 192);
+static PORT_PMA_ATTR_EXT(port_rcv_packets          , 64, 256);
+static PORT_PMA_ATTR_EXT(unicast_xmit_packets      , 64, 320);
+static PORT_PMA_ATTR_EXT(unicast_rcv_packets       , 64, 384);
+static PORT_PMA_ATTR_EXT(multicast_xmit_packets            , 64, 448);
+static PORT_PMA_ATTR_EXT(multicast_rcv_packets     , 64, 512);
+
 static struct attribute *pma_attrs[] = {
        &port_pma_attr_symbol_error.attr.attr,
        &port_pma_attr_link_error_recovery.attr.attr,
@@ -423,11 +551,65 @@ static struct attribute *pma_attrs[] = {
        NULL
 };
 
+static struct attribute *pma_attrs_ext[] = {
+       &port_pma_attr_symbol_error.attr.attr,
+       &port_pma_attr_link_error_recovery.attr.attr,
+       &port_pma_attr_link_downed.attr.attr,
+       &port_pma_attr_port_rcv_errors.attr.attr,
+       &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+       &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+       &port_pma_attr_port_xmit_discards.attr.attr,
+       &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+       &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+       &port_pma_attr_local_link_integrity_errors.attr.attr,
+       &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+       &port_pma_attr_VL15_dropped.attr.attr,
+       &port_pma_attr_ext_port_xmit_data.attr.attr,
+       &port_pma_attr_ext_port_rcv_data.attr.attr,
+       &port_pma_attr_ext_port_xmit_packets.attr.attr,
+       &port_pma_attr_ext_port_rcv_packets.attr.attr,
+       &port_pma_attr_ext_unicast_rcv_packets.attr.attr,
+       &port_pma_attr_ext_unicast_xmit_packets.attr.attr,
+       &port_pma_attr_ext_multicast_rcv_packets.attr.attr,
+       &port_pma_attr_ext_multicast_xmit_packets.attr.attr,
+       NULL
+};
+
+static struct attribute *pma_attrs_noietf[] = {
+       &port_pma_attr_symbol_error.attr.attr,
+       &port_pma_attr_link_error_recovery.attr.attr,
+       &port_pma_attr_link_downed.attr.attr,
+       &port_pma_attr_port_rcv_errors.attr.attr,
+       &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+       &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+       &port_pma_attr_port_xmit_discards.attr.attr,
+       &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+       &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+       &port_pma_attr_local_link_integrity_errors.attr.attr,
+       &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+       &port_pma_attr_VL15_dropped.attr.attr,
+       &port_pma_attr_ext_port_xmit_data.attr.attr,
+       &port_pma_attr_ext_port_rcv_data.attr.attr,
+       &port_pma_attr_ext_port_xmit_packets.attr.attr,
+       &port_pma_attr_ext_port_rcv_packets.attr.attr,
+       NULL
+};
+
 static struct attribute_group pma_group = {
        .name  = "counters",
        .attrs  = pma_attrs
 };
 
+static struct attribute_group pma_group_ext = {
+       .name  = "counters",
+       .attrs  = pma_attrs_ext
+};
+
+static struct attribute_group pma_group_noietf = {
+       .name  = "counters",
+       .attrs  = pma_attrs_noietf
+};
+
 static void ib_port_release(struct kobject *kobj)
 {
        struct ib_port *p = container_of(kobj, struct ib_port, kobj);
@@ -451,12 +633,41 @@ static void ib_port_release(struct kobject *kobj)
        kfree(p);
 }
 
+static void ib_port_gid_attr_release(struct kobject *kobj)
+{
+       struct gid_attr_group *g = container_of(kobj, struct gid_attr_group,
+                                               kobj);
+       struct attribute *a;
+       int i;
+
+       if (g->ndev.attrs) {
+               for (i = 0; (a = g->ndev.attrs[i]); ++i)
+                       kfree(a);
+
+               kfree(g->ndev.attrs);
+       }
+
+       if (g->type.attrs) {
+               for (i = 0; (a = g->type.attrs[i]); ++i)
+                       kfree(a);
+
+               kfree(g->type.attrs);
+       }
+
+       kfree(g);
+}
+
 static struct kobj_type port_type = {
        .release       = ib_port_release,
        .sysfs_ops     = &port_sysfs_ops,
        .default_attrs = port_default_attrs
 };
 
+static struct kobj_type gid_attr_type = {
+       .sysfs_ops      = &gid_attr_sysfs_ops,
+       .release        = ib_port_gid_attr_release
+};
+
 static struct attribute **
 alloc_group_attrs(ssize_t (*show)(struct ib_port *,
                                  struct port_attribute *, char *buf),
@@ -500,6 +711,31 @@ err:
        return NULL;
 }
 
+/*
+ * Figure out which counter table to use depending on
+ * the device capabilities.
+ */
+static struct attribute_group *get_counter_table(struct ib_device *dev,
+                                                int port_num)
+{
+       struct ib_class_port_info cpi;
+
+       if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO,
+                               &cpi, 40, sizeof(cpi)) >= 0) {
+               if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH)
+                       /* We have extended counters */
+                       return &pma_group_ext;
+
+               if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF)
+                       /* But not the IETF ones */
+                       return &pma_group_noietf;
+       }
+
+       /* Fall back to normal counters */
+       return &pma_group;
+}
+
 static int add_port(struct ib_device *device, int port_num,
                    int (*port_callback)(struct ib_device *,
                                         u8, struct kobject *))
@@ -528,9 +764,24 @@ static int add_port(struct ib_device *device, int port_num,
                return ret;
        }
 
-       ret = sysfs_create_group(&p->kobj, &pma_group);
-       if (ret)
+       p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL);
+       if (!p->gid_attr_group) {
+               ret = -ENOMEM;
                goto err_put;
+       }
+
+       p->gid_attr_group->port = p;
+       ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type,
+                                  &p->kobj, "gid_attrs");
+       if (ret) {
+               kfree(p->gid_attr_group);
+               goto err_put;
+       }
+
+       p->pma_table = get_counter_table(device, port_num);
+       ret = sysfs_create_group(&p->kobj, p->pma_table);
+       if (ret)
+               goto err_put_gid_attrs;
 
        p->gid_group.name  = "gids";
        p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
@@ -543,12 +794,38 @@ static int add_port(struct ib_device *device, int port_num,
        if (ret)
                goto err_free_gid;
 
+       p->gid_attr_group->ndev.name = "ndevs";
+       p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev,
+                                                         attr.gid_tbl_len);
+       if (!p->gid_attr_group->ndev.attrs) {
+               ret = -ENOMEM;
+               goto err_remove_gid;
+       }
+
+       ret = sysfs_create_group(&p->gid_attr_group->kobj,
+                                &p->gid_attr_group->ndev);
+       if (ret)
+               goto err_free_gid_ndev;
+
+       p->gid_attr_group->type.name = "types";
+       p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type,
+                                                         attr.gid_tbl_len);
+       if (!p->gid_attr_group->type.attrs) {
+               ret = -ENOMEM;
+               goto err_remove_gid_ndev;
+       }
+
+       ret = sysfs_create_group(&p->gid_attr_group->kobj,
+                                &p->gid_attr_group->type);
+       if (ret)
+               goto err_free_gid_type;
+
        p->pkey_group.name  = "pkeys";
        p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
                                                attr.pkey_tbl_len);
        if (!p->pkey_group.attrs) {
                ret = -ENOMEM;
-               goto err_remove_gid;
+               goto err_remove_gid_type;
        }
 
        ret = sysfs_create_group(&p->kobj, &p->pkey_group);
@@ -576,6 +853,28 @@ err_free_pkey:
        kfree(p->pkey_group.attrs);
        p->pkey_group.attrs = NULL;
 
+err_remove_gid_type:
+       sysfs_remove_group(&p->gid_attr_group->kobj,
+                          &p->gid_attr_group->type);
+
+err_free_gid_type:
+       for (i = 0; i < attr.gid_tbl_len; ++i)
+               kfree(p->gid_attr_group->type.attrs[i]);
+
+       kfree(p->gid_attr_group->type.attrs);
+       p->gid_attr_group->type.attrs = NULL;
+
+err_remove_gid_ndev:
+       sysfs_remove_group(&p->gid_attr_group->kobj,
+                          &p->gid_attr_group->ndev);
+
+err_free_gid_ndev:
+       for (i = 0; i < attr.gid_tbl_len; ++i)
+               kfree(p->gid_attr_group->ndev.attrs[i]);
+
+       kfree(p->gid_attr_group->ndev.attrs);
+       p->gid_attr_group->ndev.attrs = NULL;
+
 err_remove_gid:
        sysfs_remove_group(&p->kobj, &p->gid_group);
 
@@ -587,7 +886,10 @@ err_free_gid:
        p->gid_group.attrs = NULL;
 
 err_remove_pma:
-       sysfs_remove_group(&p->kobj, &pma_group);
+       sysfs_remove_group(&p->kobj, p->pma_table);
+
+err_put_gid_attrs:
+       kobject_put(&p->gid_attr_group->kobj);
 
 err_put:
        kobject_put(&p->kobj);
@@ -614,18 +916,12 @@ static ssize_t show_sys_image_guid(struct device *device,
                                   struct device_attribute *dev_attr, char *buf)
 {
        struct ib_device *dev = container_of(device, struct ib_device, dev);
-       struct ib_device_attr attr;
-       ssize_t ret;
-
-       ret = ib_query_device(dev, &attr);
-       if (ret)
-               return ret;
 
        return sprintf(buf, "%04x:%04x:%04x:%04x\n",
-                      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
-                      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
-                      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
-                      be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+                      be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
+                      be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]),
+                      be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]),
+                      be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3]));
 }
 
 static ssize_t show_node_guid(struct device *device,
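The sys_image_guid conversion above is one instance of a wider pattern in this series: immutable device attributes are cached in ib_device->attrs at registration time, so readers drop the ib_query_device() call and its error path entirely. A sketch of the pattern, assuming a struct ib_device *dev in scope:

        /* no query round-trip; attrs are populated once at registration */
        if (dev->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
                pr_debug("device supports a local DMA lkey\n");
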
@@ -800,9 +1096,14 @@ static void free_port_list_attributes(struct ib_device *device)
        list_for_each_entry_safe(p, t, &device->port_list, entry) {
                struct ib_port *port = container_of(p, struct ib_port, kobj);
                list_del(&p->entry);
-               sysfs_remove_group(p, &pma_group);
+               sysfs_remove_group(p, port->pma_table);
                sysfs_remove_group(p, &port->pkey_group);
                sysfs_remove_group(p, &port->gid_group);
+               sysfs_remove_group(&port->gid_attr_group->kobj,
+                                  &port->gid_attr_group->ndev);
+               sysfs_remove_group(&port->gid_attr_group->kobj,
+                                  &port->gid_attr_group->type);
+               kobject_put(&port->gid_attr_group->kobj);
                kobject_put(p);
        }
 
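Resulting sysfs layout, sketched (the gid_type strings come from ib_cache_gid_type_str() in the companion cache patch, and the HCA name is illustrative):

        /sys/class/infiniband/mlx4_0/ports/1/gid_attrs/ndevs/0   netdev backing GID 0
        /sys/class/infiniband/mlx4_0/ports/1/gid_attrs/types/0   e.g. "IB/RoCE v1"
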
index 72feee620ebfb33300a2121a2a22d0fc1d1eafb3..19837d2702789e63f1d948c1abaf4bae9a28d7f6 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/string.h>
 #include <linux/export.h>
 #include <linux/if_ether.h>
+#include <linux/ip.h>
 
 #include <rdma/ib_pack.h>
 
@@ -116,6 +117,72 @@ static const struct ib_field vlan_table[]  = {
          .size_bits    = 16 }
 };
 
+static const struct ib_field ip4_table[]  = {
+       { STRUCT_FIELD(ip4, ver),
+         .offset_words = 0,
+         .offset_bits  = 0,
+         .size_bits    = 4 },
+       { STRUCT_FIELD(ip4, hdr_len),
+         .offset_words = 0,
+         .offset_bits  = 4,
+         .size_bits    = 4 },
+       { STRUCT_FIELD(ip4, tos),
+         .offset_words = 0,
+         .offset_bits  = 8,
+         .size_bits    = 8 },
+       { STRUCT_FIELD(ip4, tot_len),
+         .offset_words = 0,
+         .offset_bits  = 16,
+         .size_bits    = 16 },
+       { STRUCT_FIELD(ip4, id),
+         .offset_words = 1,
+         .offset_bits  = 0,
+         .size_bits    = 16 },
+       { STRUCT_FIELD(ip4, frag_off),
+         .offset_words = 1,
+         .offset_bits  = 16,
+         .size_bits    = 16 },
+       { STRUCT_FIELD(ip4, ttl),
+         .offset_words = 2,
+         .offset_bits  = 0,
+         .size_bits    = 8 },
+       { STRUCT_FIELD(ip4, protocol),
+         .offset_words = 2,
+         .offset_bits  = 8,
+         .size_bits    = 8 },
+       { STRUCT_FIELD(ip4, check),
+         .offset_words = 2,
+         .offset_bits  = 16,
+         .size_bits    = 16 },
+       { STRUCT_FIELD(ip4, saddr),
+         .offset_words = 3,
+         .offset_bits  = 0,
+         .size_bits    = 32 },
+       { STRUCT_FIELD(ip4, daddr),
+         .offset_words = 4,
+         .offset_bits  = 0,
+         .size_bits    = 32 }
+};
+
+static const struct ib_field udp_table[]  = {
+       { STRUCT_FIELD(udp, sport),
+         .offset_words = 0,
+         .offset_bits  = 0,
+         .size_bits    = 16 },
+       { STRUCT_FIELD(udp, dport),
+         .offset_words = 0,
+         .offset_bits  = 16,
+         .size_bits    = 16 },
+       { STRUCT_FIELD(udp, length),
+         .offset_words = 1,
+         .offset_bits  = 0,
+         .size_bits    = 16 },
+       { STRUCT_FIELD(udp, csum),
+         .offset_words = 1,
+         .offset_bits  = 16,
+         .size_bits    = 16 }
+};
+
 static const struct ib_field grh_table[]  = {
        { STRUCT_FIELD(grh, ip_version),
          .offset_words = 0,
@@ -213,26 +280,57 @@ static const struct ib_field deth_table[] = {
          .size_bits    = 24 }
 };
 
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header)
+{
+       struct iphdr iph;
+
+       iph.ihl         = 5;
+       iph.version     = 4;
+       iph.tos         = header->ip4.tos;
+       iph.tot_len     = header->ip4.tot_len;
+       iph.id          = header->ip4.id;
+       iph.frag_off    = header->ip4.frag_off;
+       iph.ttl         = header->ip4.ttl;
+       iph.protocol    = header->ip4.protocol;
+       iph.check       = 0;
+       iph.saddr       = header->ip4.saddr;
+       iph.daddr       = header->ip4.daddr;
+
+       return ip_fast_csum((u8 *)&iph, iph.ihl);
+}
+EXPORT_SYMBOL(ib_ud_ip4_csum);
+
 /**
  * ib_ud_header_init - Initialize UD header structure
  * @payload_bytes:Length of packet payload
  * @lrh_present: specify if LRH is present
  * @eth_present: specify if Eth header is present
  * @vlan_present: packet is tagged vlan
- * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @grh_present: GRH flag (if non-zero, GRH will be included)
+ * @ip_version: if non-zero, IP header, V4 or V6, will be included
+ * @udp_present: if non-zero, UDP header will be included
  * @immediate_present: specify if immediate data is present
  * @header:Structure to initialize
  */
-void ib_ud_header_init(int                         payload_bytes,
-                      int                  lrh_present,
-                      int                  eth_present,
-                      int                  vlan_present,
-                      int                  grh_present,
-                      int                  immediate_present,
-                      struct ib_ud_header *header)
+int ib_ud_header_init(int     payload_bytes,
+                     int    lrh_present,
+                     int    eth_present,
+                     int    vlan_present,
+                     int    grh_present,
+                     int    ip_version,
+                     int    udp_present,
+                     int    immediate_present,
+                     struct ib_ud_header *header)
 {
+       grh_present = grh_present && !ip_version;
        memset(header, 0, sizeof *header);
 
+       /*
+        * UDP header without IP header doesn't make sense
+        */
+       if (udp_present && ip_version != 4 && ip_version != 6)
+               return -EINVAL;
+
        if (lrh_present) {
                u16 packet_length;
 
@@ -252,7 +350,7 @@ void ib_ud_header_init(int                      payload_bytes,
        if (vlan_present)
                header->eth.type = cpu_to_be16(ETH_P_8021Q);
 
-       if (grh_present) {
+       if (ip_version == 6 || grh_present) {
                header->grh.ip_version      = 6;
                header->grh.payload_length  =
                        cpu_to_be16((IB_BTH_BYTES     +
@@ -260,8 +358,30 @@ void ib_ud_header_init(int                     payload_bytes,
                                     payload_bytes    +
                                     4                + /* ICRC     */
                                     3) & ~3);          /* round up */
-               header->grh.next_header     = 0x1b;
+               header->grh.next_header     = udp_present ? IPPROTO_UDP : 0x1b;
+       }
+
+       if (ip_version == 4) {
+               int udp_bytes = udp_present ? IB_UDP_BYTES : 0;
+
+               header->ip4.ver = 4; /* version 4 */
+               header->ip4.hdr_len = 5; /* 5 words */
+               header->ip4.tot_len =
+                       cpu_to_be16(IB_IP4_BYTES   +
+                                    udp_bytes     +
+                                    IB_BTH_BYTES  +
+                                    IB_DETH_BYTES +
+                                    payload_bytes +
+                                    4);     /* ICRC     */
+               header->ip4.protocol = IPPROTO_UDP;
        }
+       if (udp_present && ip_version)
+               header->udp.length =
+                       cpu_to_be16(IB_UDP_BYTES   +
+                                    IB_BTH_BYTES  +
+                                    IB_DETH_BYTES +
+                                    payload_bytes +
+                                    4);     /* ICRC     */
 
        if (immediate_present)
                header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
@@ -273,8 +393,11 @@ void ib_ud_header_init(int                     payload_bytes,
        header->lrh_present = lrh_present;
        header->eth_present = eth_present;
        header->vlan_present = vlan_present;
-       header->grh_present = grh_present;
+       header->grh_present = grh_present || (ip_version == 6);
+       header->ipv4_present = ip_version == 4;
+       header->udp_present = udp_present;
        header->immediate_present = immediate_present;
+       return 0;
 }
 EXPORT_SYMBOL(ib_ud_header_init);
 
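ib_ud_header_init() now returns an error and takes ip_version/udp_present, so a RoCE v2 sender builds an Eth + IPv4 + UDP header and must check the result; the IPv4 checksum is then filled in with the helper above once the driver has set tos/ttl and the addresses from the AH. A minimal sketch, with payload_len and vlan_present supplied by the caller:

        struct ib_ud_header hdr;
        int err;

        err = ib_ud_header_init(payload_len, 0 /* no LRH */, 1 /* Eth */,
                                vlan_present, 0 /* no GRH */, 4 /* IPv4 */,
                                1 /* UDP */, 0 /* no immediate */, &hdr);
        if (err)
                return err;

        /* driver fills hdr.ip4.tos/ttl/saddr/daddr from the AH, then: */
        hdr.ip4.check = ib_ud_ip4_csum(&hdr);
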
@@ -311,6 +434,16 @@ int ib_ud_header_pack(struct ib_ud_header *header,
                        &header->grh, buf + len);
                len += IB_GRH_BYTES;
        }
+       if (header->ipv4_present) {
+               ib_pack(ip4_table, ARRAY_SIZE(ip4_table),
+                       &header->ip4, buf + len);
+               len += IB_IP4_BYTES;
+       }
+       if (header->udp_present) {
+               ib_pack(udp_table, ARRAY_SIZE(udp_table),
+                       &header->udp, buf + len);
+               len += IB_UDP_BYTES;
+       }
 
        ib_pack(bth_table, ARRAY_SIZE(bth_table),
                &header->bth, buf + len);
index 40becdb3196e07b97c94ade818af5755bfaae4db..e69bf266049d0117830577167c2987c83cd8ac62 100644 (file)
@@ -232,7 +232,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
        ib_ucontext_notifier_end_account(context);
 }
 
-static struct mmu_notifier_ops ib_umem_notifiers = {
+static const struct mmu_notifier_ops ib_umem_notifiers = {
        .release                    = ib_umem_notifier_release,
        .invalidate_page            = ib_umem_notifier_invalidate_page,
        .invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
index 57f281f8d686224b7a40488330b78ac38943270c..415a3185cde7f45dfc583180ef1a4ef81fec8699 100644 (file)
@@ -210,6 +210,7 @@ static void send_handler(struct ib_mad_agent *agent,
 }
 
 static void recv_handler(struct ib_mad_agent *agent,
+                        struct ib_mad_send_buf *send_buf,
                         struct ib_mad_recv_wc *mad_recv_wc)
 {
        struct ib_umad_file *file = agent->context;
index 94bbd8c155fcca1daf5f2e9ee0fc65552821ff3f..612ccfd39bf98181693f07c5dfcc661dc8191e39 100644 (file)
@@ -204,6 +204,8 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
                             struct ib_event *event);
 void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
 
+int uverbs_dealloc_mw(struct ib_mw *mw);
+
 struct ib_uverbs_flow_spec {
        union {
                union {
index 1c02deab068fbf1031d5b21db960feb9b06865c5..6ffc9c4e93afb4efa27fe6f3c17fc3cdf41d8c21 100644 (file)
@@ -291,9 +291,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        struct ib_uverbs_get_context      cmd;
        struct ib_uverbs_get_context_resp resp;
        struct ib_udata                   udata;
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       struct ib_device_attr             dev_attr;
-#endif
        struct ib_ucontext               *ucontext;
        struct file                      *filp;
        int ret;
@@ -342,10 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        ucontext->odp_mrs_count = 0;
        INIT_LIST_HEAD(&ucontext->no_private_counters);
 
-       ret = ib_query_device(ib_dev, &dev_attr);
-       if (ret)
-               goto err_free;
-       if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+       if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
                ucontext->invalidate_range = NULL;
 
 #endif
@@ -447,8 +441,6 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
 {
        struct ib_uverbs_query_device      cmd;
        struct ib_uverbs_query_device_resp resp;
-       struct ib_device_attr              attr;
-       int                                ret;
 
        if (out_len < sizeof resp)
                return -ENOSPC;
@@ -456,12 +448,8 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
        if (copy_from_user(&cmd, buf, sizeof cmd))
                return -EFAULT;
 
-       ret = ib_query_device(ib_dev, &attr);
-       if (ret)
-               return ret;
-
        memset(&resp, 0, sizeof resp);
-       copy_query_dev_fields(file, ib_dev, &resp, &attr);
+       copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs);
 
        if (copy_to_user((void __user *) (unsigned long) cmd.response,
                         &resp, sizeof resp))
@@ -986,11 +974,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
        }
 
        if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
-               struct ib_device_attr attr;
-
-               ret = ib_query_device(pd->device, &attr);
-               if (ret || !(attr.device_cap_flags &
-                               IB_DEVICE_ON_DEMAND_PAGING)) {
+               if (!(pd->device->attrs.device_cap_flags &
+                     IB_DEVICE_ON_DEMAND_PAGING)) {
                        pr_debug("ODP support not available\n");
                        ret = -EINVAL;
                        goto err_put;
@@ -1008,7 +993,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
        mr->pd      = pd;
        mr->uobject = uobj;
        atomic_inc(&pd->usecnt);
-       atomic_set(&mr->usecnt, 0);
 
        uobj->object = mr;
        ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
@@ -1106,11 +1090,6 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
                }
        }
 
-       if (atomic_read(&mr->usecnt)) {
-               ret = -EBUSY;
-               goto put_uobj_pd;
-       }
-
        old_pd = mr->pd;
        ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
                                        cmd.length, cmd.hca_va,
@@ -1258,7 +1237,7 @@ err_copy:
        idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
 
 err_unalloc:
-       ib_dealloc_mw(mw);
+       uverbs_dealloc_mw(mw);
 
 err_put:
        put_pd_read(pd);
@@ -1287,7 +1266,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 
        mw = uobj->object;
 
-       ret = ib_dealloc_mw(mw);
+       ret = uverbs_dealloc_mw(mw);
        if (!ret)
                uobj->live = 0;
 
@@ -1845,7 +1824,10 @@ static int create_qp(struct ib_uverbs_file *file,
                      sizeof(cmd->create_flags))
                attr.create_flags = cmd->create_flags;
 
-       if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
+       if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
+                               IB_QP_CREATE_CROSS_CHANNEL |
+                               IB_QP_CREATE_MANAGED_SEND |
+                               IB_QP_CREATE_MANAGED_RECV)) {
                ret = -EINVAL;
                goto err_put;
        }
index e3ef28861be63dd453d0fd1b55ec21f939919624..39680aed99dd0d4593f91317e666393c562d33d3 100644 (file)
@@ -133,6 +133,17 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
 static void ib_uverbs_add_one(struct ib_device *device);
 static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
 
+int uverbs_dealloc_mw(struct ib_mw *mw)
+{
+       struct ib_pd *pd = mw->pd;
+       int ret;
+
+       ret = mw->device->dealloc_mw(mw);
+       if (!ret)
+               atomic_dec(&pd->usecnt);
+       return ret;
+}
+
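With ib_alloc_mw()/ib_dealloc_mw() removed from the core (see the verbs.c hunk below), uverbs is the only MW consumer and drives the device ops directly, keeping the PD reference count itself. A condensed sketch of the matching allocation side as uverbs now performs it:

        mw = pd->device->alloc_mw(pd, cmd.mw_type);
        if (IS_ERR(mw))
                return PTR_ERR(mw);
        mw->device = pd->device;
        mw->pd     = pd;
        atomic_inc(&pd->usecnt);
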
 static void ib_uverbs_release_dev(struct kobject *kobj)
 {
        struct ib_uverbs_device *dev =
@@ -224,7 +235,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
                struct ib_mw *mw = uobj->object;
 
                idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
-               ib_dealloc_mw(mw);
+               uverbs_dealloc_mw(mw);
                kfree(uobj);
        }
 
index 7d2f14c9bbefaf1f3c28eac11b7bcefbc2802927..af020f80d50f4888436db49f935f29a0f5f5b22e 100644 (file)
@@ -144,5 +144,6 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
        memset(dst->dmac, 0, sizeof(dst->dmac));
        dst->net = NULL;
        dst->ifindex = 0;
+       dst->gid_type = IB_GID_TYPE_IB;
 }
 EXPORT_SYMBOL(ib_copy_path_rec_from_user);
index 545906dec26dc5e936df37c635c68b94bb27e226..5af6d024e0538a9eb63049ef7009ad7dab09a967 100644 (file)
@@ -229,12 +229,6 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
 struct ib_pd *ib_alloc_pd(struct ib_device *device)
 {
        struct ib_pd *pd;
-       struct ib_device_attr devattr;
-       int rc;
-
-       rc = ib_query_device(device, &devattr);
-       if (rc)
-               return ERR_PTR(rc);
 
        pd = device->alloc_pd(device, NULL, NULL);
        if (IS_ERR(pd))
@@ -245,7 +239,7 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device)
        pd->local_mr = NULL;
        atomic_set(&pd->usecnt, 0);
 
-       if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+       if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
                pd->local_dma_lkey = device->local_dma_lkey;
        else {
                struct ib_mr *mr;
@@ -311,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 }
 EXPORT_SYMBOL(ib_create_ah);
 
+static int ib_get_header_version(const union rdma_network_hdr *hdr)
+{
+       const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
+       struct iphdr ip4h_checked;
+       const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
+
+       /* If it's IPv6, the version must be 6, otherwise, the first
+        * 20 bytes (before the IPv4 header) are garbled.
+        */
+       if (ip6h->version != 6)
+               return (ip4h->version == 4) ? 4 : 0;
+       /* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+       /* RoCE v2 requires no options, thus header length
+        * must be 5 words
+        */
+       if (ip4h->ihl != 5)
+               return 6;
+
+       /* Verify checksum.
+        * We can't write on scattered buffers so we need to copy to
+        * temp buffer.
+        */
+       memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+       ip4h_checked.check = 0;
+       ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+       /* if IPv4 header checksum is OK, believe it */
+       if (ip4h->check == ip4h_checked.check)
+               return 4;
+       return 6;
+}
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+                                                    u8 port_num,
+                                                    const struct ib_grh *grh)
+{
+       int grh_version;
+
+       if (rdma_protocol_ib(device, port_num))
+               return RDMA_NETWORK_IB;
+
+       grh_version = ib_get_header_version((union rdma_network_hdr *)grh);
+
+       if (grh_version == 4)
+               return RDMA_NETWORK_IPV4;
+
+       if (grh->next_hdr == IPPROTO_UDP)
+               return RDMA_NETWORK_IPV6;
+
+       return RDMA_NETWORK_ROCE_V1;
+}
+
 struct find_gid_index_context {
        u16 vlan_id;
+       enum ib_gid_type gid_type;
 };
 
 static bool find_gid_index(const union ib_gid *gid,
@@ -322,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid,
        struct find_gid_index_context *ctx =
                (struct find_gid_index_context *)context;
 
+       if (ctx->gid_type != gid_attr->gid_type)
+               return false;
+
        if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
            (is_vlan_dev(gid_attr->ndev) &&
             vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
@@ -332,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid,
 
 static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
                                   u16 vlan_id, const union ib_gid *sgid,
+                                  enum ib_gid_type gid_type,
                                   u16 *gid_index)
 {
-       struct find_gid_index_context context = {.vlan_id = vlan_id};
+       struct find_gid_index_context context = {.vlan_id = vlan_id,
+                                                .gid_type = gid_type};
 
        return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
                                     &context, gid_index);
 }
 
+static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr,
+                                 enum rdma_network_type net_type,
+                                 union ib_gid *sgid, union ib_gid *dgid)
+{
+       struct sockaddr_in  src_in;
+       struct sockaddr_in  dst_in;
+       __be32 src_saddr, dst_saddr;
+
+       if (!sgid || !dgid)
+               return -EINVAL;
+
+       if (net_type == RDMA_NETWORK_IPV4) {
+               memcpy(&src_in.sin_addr.s_addr,
+                      &hdr->roce4grh.saddr, 4);
+               memcpy(&dst_in.sin_addr.s_addr,
+                      &hdr->roce4grh.daddr, 4);
+               src_saddr = src_in.sin_addr.s_addr;
+               dst_saddr = dst_in.sin_addr.s_addr;
+               ipv6_addr_set_v4mapped(src_saddr,
+                                      (struct in6_addr *)sgid);
+               ipv6_addr_set_v4mapped(dst_saddr,
+                                      (struct in6_addr *)dgid);
+               return 0;
+       } else if (net_type == RDMA_NETWORK_IPV6 ||
+                  net_type == RDMA_NETWORK_IB) {
+               *dgid = hdr->ibgrh.dgid;
+               *sgid = hdr->ibgrh.sgid;
+               return 0;
+       } else {
+               return -EINVAL;
+       }
+}
+
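For RoCE v2 over IPv4 the GID is simply the v4-mapped IPv6 form of the address, which is what ipv6_addr_set_v4mapped() produces. A small illustration with the documentation address 192.0.2.1:

        union ib_gid sgid;

        ipv6_addr_set_v4mapped(cpu_to_be32(0xc0000201),  /* 192.0.2.1 */
                               (struct in6_addr *)&sgid);
        /* sgid now holds ::ffff:192.0.2.1 */
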
 int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
                       const struct ib_wc *wc, const struct ib_grh *grh,
                       struct ib_ah_attr *ah_attr)
@@ -347,33 +432,72 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
        u32 flow_class;
        u16 gid_index;
        int ret;
+       enum rdma_network_type net_type = RDMA_NETWORK_IB;
+       enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+       int hoplimit = 0xff;
+       union ib_gid dgid;
+       union ib_gid sgid;
 
        memset(ah_attr, 0, sizeof *ah_attr);
        if (rdma_cap_eth_ah(device, port_num)) {
+               if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+                       net_type = wc->network_hdr_type;
+               else
+                       net_type = ib_get_net_type_by_grh(device, port_num, grh);
+               gid_type = ib_network_to_gid_type(net_type);
+       }
+       ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+                                    &sgid, &dgid);
+       if (ret)
+               return ret;
+
+       if (rdma_protocol_roce(device, port_num)) {
+               int if_index = 0;
                u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
                                wc->vlan_id : 0xffff;
+               struct net_device *idev;
+               struct net_device *resolved_dev;
 
                if (!(wc->wc_flags & IB_WC_GRH))
                        return -EPROTOTYPE;
 
-               if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
-                   !(wc->wc_flags & IB_WC_WITH_VLAN)) {
-                       ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
-                                                        ah_attr->dmac,
-                                                        wc->wc_flags & IB_WC_WITH_VLAN ?
-                                                        NULL : &vlan_id,
-                                                        0);
-                       if (ret)
-                               return ret;
+               if (!device->get_netdev)
+                       return -EOPNOTSUPP;
+
+               idev = device->get_netdev(device, port_num);
+               if (!idev)
+                       return -ENODEV;
+
+               ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid,
+                                                  ah_attr->dmac,
+                                                  wc->wc_flags & IB_WC_WITH_VLAN ?
+                                                  NULL : &vlan_id,
+                                                  &if_index, &hoplimit);
+               if (ret) {
+                       dev_put(idev);
+                       return ret;
                }
 
-               ret = get_sgid_index_from_eth(device, port_num, vlan_id,
-                                             &grh->dgid, &gid_index);
+               resolved_dev = dev_get_by_index(&init_net, if_index);
+               if (!resolved_dev) {
+                       dev_put(idev);
+                       return -ENODEV;
+               }
+               if (resolved_dev->flags & IFF_LOOPBACK) {
+                       dev_put(resolved_dev);
+                       resolved_dev = idev;
+                       dev_hold(resolved_dev);
+               }
+               rcu_read_lock();
+               if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev,
+                                                                  resolved_dev))
+                       ret = -EHOSTUNREACH;
+               rcu_read_unlock();
+               dev_put(idev);
+               dev_put(resolved_dev);
                if (ret)
                        return ret;
 
-               if (wc->wc_flags & IB_WC_WITH_SMAC)
-                       memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+               ret = get_sgid_index_from_eth(device, port_num, vlan_id,
+                                             &dgid, gid_type, &gid_index);
+               if (ret)
+                       return ret;
        }
 
        ah_attr->dlid = wc->slid;
@@ -383,10 +507,11 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 
        if (wc->wc_flags & IB_WC_GRH) {
                ah_attr->ah_flags = IB_AH_GRH;
-               ah_attr->grh.dgid = grh->sgid;
+               ah_attr->grh.dgid = sgid;
 
                if (!rdma_cap_eth_ah(device, port_num)) {
-                       ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+                       ret = ib_find_cached_gid_by_port(device, &dgid,
+                                                        IB_GID_TYPE_IB,
                                                         port_num, NULL,
                                                         &gid_index);
                        if (ret)
@@ -396,7 +521,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
                ah_attr->grh.sgid_index = (u8) gid_index;
                flow_class = be32_to_cpu(grh->version_tclass_flow);
                ah_attr->grh.flow_label = flow_class & 0xFFFFF;
-               ah_attr->grh.hop_limit = 0xFF;
+               ah_attr->grh.hop_limit = hoplimit;
                ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
        }
        return 0;
@@ -1014,6 +1139,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
                        union ib_gid            sgid;
                        struct ib_gid_attr      sgid_attr;
                        int                     ifindex;
+                       int                     hop_limit;
 
                        ret = ib_query_gid(qp->device,
                                           qp_attr->ah_attr.port_num,
@@ -1028,12 +1154,14 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
 
                        ifindex = sgid_attr.ndev->ifindex;
 
-                       ret = rdma_addr_find_dmac_by_grh(&sgid,
-                                                        &qp_attr->ah_attr.grh.dgid,
-                                                        qp_attr->ah_attr.dmac,
-                                                        NULL, ifindex);
+                       ret = rdma_addr_find_l2_eth_by_grh(&sgid,
+                                                          &qp_attr->ah_attr.grh.dgid,
+                                                          qp_attr->ah_attr.dmac,
+                                                          NULL, &ifindex, &hop_limit);
 
                        dev_put(sgid_attr.ndev);
+
+                       qp_attr->ah_attr.grh.hop_limit = hop_limit;
                }
        }
 out:
@@ -1215,29 +1343,17 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
                mr->pd      = pd;
                mr->uobject = NULL;
                atomic_inc(&pd->usecnt);
-               atomic_set(&mr->usecnt, 0);
        }
 
        return mr;
 }
 EXPORT_SYMBOL(ib_get_dma_mr);
 
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
-       return mr->device->query_mr ?
-               mr->device->query_mr(mr, mr_attr) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_mr);
-
 int ib_dereg_mr(struct ib_mr *mr)
 {
-       struct ib_pd *pd;
+       struct ib_pd *pd = mr->pd;
        int ret;
 
-       if (atomic_read(&mr->usecnt))
-               return -EBUSY;
-
-       pd = mr->pd;
        ret = mr->device->dereg_mr(mr);
        if (!ret)
                atomic_dec(&pd->usecnt);
@@ -1273,49 +1389,12 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
                mr->pd      = pd;
                mr->uobject = NULL;
                atomic_inc(&pd->usecnt);
-               atomic_set(&mr->usecnt, 0);
        }
 
        return mr;
 }
 EXPORT_SYMBOL(ib_alloc_mr);
 
-/* Memory windows */
-
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
-       struct ib_mw *mw;
-
-       if (!pd->device->alloc_mw)
-               return ERR_PTR(-ENOSYS);
-
-       mw = pd->device->alloc_mw(pd, type);
-       if (!IS_ERR(mw)) {
-               mw->device  = pd->device;
-               mw->pd      = pd;
-               mw->uobject = NULL;
-               mw->type    = type;
-               atomic_inc(&pd->usecnt);
-       }
-
-       return mw;
-}
-EXPORT_SYMBOL(ib_alloc_mw);
-
-int ib_dealloc_mw(struct ib_mw *mw)
-{
-       struct ib_pd *pd;
-       int ret;
-
-       pd = mw->pd;
-       ret = mw->device->dealloc_mw(mw);
-       if (!ret)
-               atomic_dec(&pd->usecnt);
-
-       return ret;
-}
-EXPORT_SYMBOL(ib_dealloc_mw);
-
 /* "Fast" memory regions */
 
 struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
@@ -1530,7 +1609,7 @@ int ib_sg_to_pages(struct ib_mr *mr,
                   int (*set_page)(struct ib_mr *, u64))
 {
        struct scatterlist *sg;
-       u64 last_end_dma_addr = 0, last_page_addr = 0;
+       u64 last_end_dma_addr = 0;
        unsigned int last_page_off = 0;
        u64 page_mask = ~((u64)mr->page_size - 1);
        int i, ret;
@@ -1572,7 +1651,6 @@ next_page:
 
                mr->length += dma_len;
                last_end_dma_addr = end_dma_addr;
-               last_page_addr = end_dma_addr & page_mask;
                last_page_off = end_dma_addr & ~page_mask;
        }
 
index cb78b1e9bcd9ec3035884e3c0c6504bf317dbfc8..f504ba73e5dc27200c988f342ee3d6423072ee9e 100644 (file)
@@ -149,7 +149,7 @@ static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_en
        error = l2t_send(tdev, skb, l2e);
        if (error < 0)
                kfree_skb(skb);
-       return error;
+       return error < 0 ? error : 0;
 }
 
 int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
@@ -165,7 +165,7 @@ int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
        error = cxgb3_ofld_send(tdev, skb);
        if (error < 0)
                kfree_skb(skb);
-       return error;
+       return error < 0 ? error : 0;
 }
 
 static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb)
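
Both send helpers had the same latent bug: l2t_send() and cxgb3_ofld_send() return a negative errno on failure but may return a positive transmit status on success, and some callers treated any non-zero return as an error. The fix clamps success to 0 at the source. The pattern as a self-contained sketch (the wrapper name is illustrative):

	/* Only a negative return means the skb was not consumed. */
	static int ofld_send_normalized(struct t3cdev *tdev, struct sk_buff *skb)
	{
		int error = cxgb3_ofld_send(tdev, skb);

		if (error < 0)
			kfree_skb(skb);		/* still ours on failure */
		return error < 0 ? error : 0;	/* positive status is not an error */
	}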
index cfe404925a399f2be0cd1bdbc23c262ad1b24ca9..97fbfd2c298ecc431ba7049e28f13ea33ba8a520 100644 (file)
@@ -115,10 +115,6 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
                case T3_SEND_WITH_SE_INV:
                        wc->opcode = IB_WC_SEND;
                        break;
-               case T3_BIND_MW:
-                       wc->opcode = IB_WC_BIND_MW;
-                       break;
-
                case T3_LOCAL_INV:
                        wc->opcode = IB_WC_LOCAL_INV;
                        break;
index 5c36ee2809acf52f8f212483dfb81e5649357baf..1d04c872c9d58c69a47dbb1b1c38f2ae69d80cde 100644 (file)
@@ -75,37 +75,6 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
        return ret;
 }
 
-int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
-                                       struct iwch_mr *mhp,
-                                       int shift,
-                                       int npages)
-{
-       u32 stag;
-       int ret;
-
-       /* We could support this... */
-       if (npages > mhp->attr.pbl_size)
-               return -ENOMEM;
-
-       stag = mhp->attr.stag;
-       if (cxio_reregister_phys_mem(&rhp->rdev,
-                                  &stag, mhp->attr.pdid,
-                                  mhp->attr.perms,
-                                  mhp->attr.zbva,
-                                  mhp->attr.va_fbo,
-                                  mhp->attr.len,
-                                  shift - 12,
-                                  mhp->attr.pbl_size, mhp->attr.pbl_addr))
-               return -ENOMEM;
-
-       ret = iwch_finish_mem_reg(mhp, stag);
-       if (ret)
-               cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
-
-       return ret;
-}
-
 int iwch_alloc_pbl(struct iwch_mr *mhp, int npages)
 {
        mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev,
@@ -130,74 +99,3 @@ int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset)
        return cxio_write_pbl(&mhp->rhp->rdev, pages,
                              mhp->attr.pbl_addr + (offset << 3), npages);
 }
-
-int build_phys_page_list(struct ib_phys_buf *buffer_list,
-                                       int num_phys_buf,
-                                       u64 *iova_start,
-                                       u64 *total_size,
-                                       int *npages,
-                                       int *shift,
-                                       __be64 **page_list)
-{
-       u64 mask;
-       int i, j, n;
-
-       mask = 0;
-       *total_size = 0;
-       for (i = 0; i < num_phys_buf; ++i) {
-               if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
-                       return -EINVAL;
-               if (i != 0 && i != num_phys_buf - 1 &&
-                   (buffer_list[i].size & ~PAGE_MASK))
-                       return -EINVAL;
-               *total_size += buffer_list[i].size;
-               if (i > 0)
-                       mask |= buffer_list[i].addr;
-               else
-                       mask |= buffer_list[i].addr & PAGE_MASK;
-               if (i != num_phys_buf - 1)
-                       mask |= buffer_list[i].addr + buffer_list[i].size;
-               else
-                       mask |= (buffer_list[i].addr + buffer_list[i].size +
-                               PAGE_SIZE - 1) & PAGE_MASK;
-       }
-
-       if (*total_size > 0xFFFFFFFFULL)
-               return -ENOMEM;
-
-       /* Find largest page shift we can use to cover buffers */
-       for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
-               if ((1ULL << *shift) & mask)
-                       break;
-
-       buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
-       buffer_list[0].addr &= ~0ull << *shift;
-
-       *npages = 0;
-       for (i = 0; i < num_phys_buf; ++i)
-               *npages += (buffer_list[i].size +
-                       (1ULL << *shift) - 1) >> *shift;
-
-       if (!*npages)
-               return -EINVAL;
-
-       *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
-       if (!*page_list)
-               return -ENOMEM;
-
-       n = 0;
-       for (i = 0; i < num_phys_buf; ++i)
-               for (j = 0;
-                    j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
-                    ++j)
-                       (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
-                           ((u64) j << *shift));
-
-       PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
-            __func__, (unsigned long long) *iova_start,
-            (unsigned long long) mask, *shift, (unsigned long long) *total_size,
-            *npages);
-
-       return 0;
-
-}
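
build_phys_page_list() (and iwch_reregister_mem() above it) disappears together with the reg_phys_mr/rereg_phys_mr verbs. Its one remaining legitimate user was the DMA MR, whose page list is degenerate: with shift fixed at 26, the 4 GB window the T3 hardware can address is covered by 64 linearly increasing 64 MB "pages", so the next file's iwch_get_dma_mr() builds the list inline. A sketch of that construction (the helper name is illustrative; the constants match the new code):

	/* 0, 64 MB, 128 MB, ... — one entry per 2^shift-sized page. */
	static __be64 *dma_mr_page_list(int shift, int npages)
	{
		__be64 *page_list = kmalloc_array(npages, sizeof(*page_list),
						  GFP_KERNEL);
		int i;

		if (!page_list)
			return NULL;
		for (i = 0; i < npages; i++)
			page_list[i] = cpu_to_be64((u64)i << shift);
		return page_list;
	}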
index c34725ca0bb426606e16708fafab2995490d7187..2734820d291b02387b8e925bb5f09ab6e23418dd 100644 (file)
@@ -458,9 +458,6 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr)
        u32 mmid;
 
        PDBG("%s ib_mr %p\n", __func__, ib_mr);
-       /* There can be no memory windows */
-       if (atomic_read(&ib_mr->usecnt))
-               return -EINVAL;
 
        mhp = to_iwch_mr(ib_mr);
        kfree(mhp->pages);
@@ -479,24 +476,25 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr)
        return 0;
 }
 
-static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
-                                       struct ib_phys_buf *buffer_list,
-                                       int num_phys_buf,
-                                       int acc,
-                                       u64 *iova_start)
+static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
 {
-       __be64 *page_list;
-       int shift;
-       u64 total_size;
-       int npages;
-       struct iwch_dev *rhp;
-       struct iwch_pd *php;
+       const u64 total_size = 0xffffffff;
+       const u64 mask = (total_size + PAGE_SIZE - 1) & PAGE_MASK;
+       struct iwch_pd *php = to_iwch_pd(pd);
+       struct iwch_dev *rhp = php->rhp;
        struct iwch_mr *mhp;
-       int ret;
+       __be64 *page_list;
+       int shift = 26, npages, ret, i;
 
        PDBG("%s ib_pd %p\n", __func__, pd);
-       php = to_iwch_pd(pd);
-       rhp = php->rhp;
+
+       /*
+        * T3 only supports 32 bits of size.
+        */
+       if (sizeof(phys_addr_t) > 4) {
+               pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n");
+               return ERR_PTR(-ENOTSUPP);
+       }
 
        mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
        if (!mhp)
@@ -504,22 +502,23 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
 
        mhp->rhp = rhp;
 
-       /* First check that we have enough alignment */
-       if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+       npages = (total_size + (1ULL << shift) - 1) >> shift;
+       if (!npages) {
                ret = -EINVAL;
                goto err;
        }
 
-       if (num_phys_buf > 1 &&
-           ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
-               ret = -EINVAL;
+       page_list = kmalloc_array(npages, sizeof(u64), GFP_KERNEL);
+       if (!page_list) {
+               ret = -ENOMEM;
                goto err;
        }
 
-       ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
-                                  &total_size, &npages, &shift, &page_list);
-       if (ret)
-               goto err;
+       for (i = 0; i < npages; i++)
+               page_list[i] = cpu_to_be64((u64)i << shift);
+
+       PDBG("%s mask 0x%llx shift %d len %lld pbl_size %d\n",
+               __func__, mask, shift, total_size, npages);
 
        ret = iwch_alloc_pbl(mhp, npages);
        if (ret) {
@@ -536,7 +535,7 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
        mhp->attr.zbva = 0;
 
        mhp->attr.perms = iwch_ib_to_tpt_access(acc);
-       mhp->attr.va_fbo = *iova_start;
+       mhp->attr.va_fbo = 0;
        mhp->attr.page_size = shift - 12;
 
        mhp->attr.len = (u32) total_size;
@@ -553,76 +552,8 @@ err_pbl:
 err:
        kfree(mhp);
        return ERR_PTR(ret);
-
-}
-
-static int iwch_reregister_phys_mem(struct ib_mr *mr,
-                                    int mr_rereg_mask,
-                                    struct ib_pd *pd,
-                                    struct ib_phys_buf *buffer_list,
-                                    int num_phys_buf,
-                                    int acc, u64 * iova_start)
-{
-
-       struct iwch_mr mh, *mhp;
-       struct iwch_pd *php;
-       struct iwch_dev *rhp;
-       __be64 *page_list = NULL;
-       int shift = 0;
-       u64 total_size;
-       int npages = 0;
-       int ret;
-
-       PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
-
-       /* There can be no memory windows */
-       if (atomic_read(&mr->usecnt))
-               return -EINVAL;
-
-       mhp = to_iwch_mr(mr);
-       rhp = mhp->rhp;
-       php = to_iwch_pd(mr->pd);
-
-       /* make sure we are on the same adapter */
-       if (rhp != php->rhp)
-               return -EINVAL;
-
-       memcpy(&mh, mhp, sizeof *mhp);
-
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               php = to_iwch_pd(pd);
-       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
-               mh.attr.perms = iwch_ib_to_tpt_access(acc);
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
-               ret = build_phys_page_list(buffer_list, num_phys_buf,
-                                          iova_start,
-                                          &total_size, &npages,
-                                          &shift, &page_list);
-               if (ret)
-                       return ret;
-       }
-
-       ret = iwch_reregister_mem(rhp, php, &mh, shift, npages);
-       kfree(page_list);
-       if (ret) {
-               return ret;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               mhp->attr.pdid = php->pdid;
-       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
-               mhp->attr.perms = iwch_ib_to_tpt_access(acc);
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
-               mhp->attr.zbva = 0;
-               mhp->attr.va_fbo = *iova_start;
-               mhp->attr.page_size = shift - 12;
-               mhp->attr.len = (u32) total_size;
-               mhp->attr.pbl_size = npages;
-       }
-
-       return 0;
 }
 
-
 static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                      u64 virt, int acc, struct ib_udata *udata)
 {
@@ -726,28 +657,6 @@ err:
        return ERR_PTR(err);
 }
 
-static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
-{
-       struct ib_phys_buf bl;
-       u64 kva;
-       struct ib_mr *ibmr;
-
-       PDBG("%s ib_pd %p\n", __func__, pd);
-
-       /*
-        * T3 only supports 32 bits of size.
-        */
-       if (sizeof(phys_addr_t) > 4) {
-               pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n");
-               return ERR_PTR(-ENOTSUPP);
-       }
-       bl.size = 0xffffffff;
-       bl.addr = 0;
-       kva = 0;
-       ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva);
-       return ibmr;
-}
-
 static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
 {
        struct iwch_dev *rhp;
@@ -1452,12 +1361,9 @@ int iwch_register_device(struct iwch_dev *dev)
        dev->ibdev.resize_cq = iwch_resize_cq;
        dev->ibdev.poll_cq = iwch_poll_cq;
        dev->ibdev.get_dma_mr = iwch_get_dma_mr;
-       dev->ibdev.reg_phys_mr = iwch_register_phys_mem;
-       dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem;
        dev->ibdev.reg_user_mr = iwch_reg_user_mr;
        dev->ibdev.dereg_mr = iwch_dereg_mr;
        dev->ibdev.alloc_mw = iwch_alloc_mw;
-       dev->ibdev.bind_mw = iwch_bind_mw;
        dev->ibdev.dealloc_mw = iwch_dealloc_mw;
        dev->ibdev.alloc_mr = iwch_alloc_mr;
        dev->ibdev.map_mr_sg = iwch_map_mr_sg;
index 2ac85b86a680dea89becaf76c75956cda42a1644..252c464a09f6a7ef6c4360a7449ba973e36369c0 100644 (file)
@@ -330,9 +330,6 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                      struct ib_send_wr **bad_wr);
 int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
                      struct ib_recv_wr **bad_wr);
-int iwch_bind_mw(struct ib_qp *qp,
-                            struct ib_mw *mw,
-                            struct ib_mw_bind *mw_bind);
 int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
 int iwch_post_zb_read(struct iwch_ep *ep);
@@ -341,21 +338,9 @@ void iwch_unregister_device(struct iwch_dev *dev);
 void stop_read_rep_timer(struct iwch_qp *qhp);
 int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
                      struct iwch_mr *mhp, int shift);
-int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
-                                       struct iwch_mr *mhp,
-                                       int shift,
-                                       int npages);
 int iwch_alloc_pbl(struct iwch_mr *mhp, int npages);
 void iwch_free_pbl(struct iwch_mr *mhp);
 int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset);
-int build_phys_page_list(struct ib_phys_buf *buffer_list,
-                                       int num_phys_buf,
-                                       u64 *iova_start,
-                                       u64 *total_size,
-                                       int *npages,
-                                       int *shift,
-                                       __be64 **page_list);
-
 
 #define IWCH_NODE_DESC "cxgb3 Chelsio Communications"
 
index d0548fc6395eac2343775ea4a4b85d3c83bb98b0..d939980a708fdcf1daa6a7a53686cb6509ef2bb5 100644 (file)
@@ -526,88 +526,6 @@ out:
        return err;
 }
 
-int iwch_bind_mw(struct ib_qp *qp,
-                            struct ib_mw *mw,
-                            struct ib_mw_bind *mw_bind)
-{
-       struct iwch_dev *rhp;
-       struct iwch_mw *mhp;
-       struct iwch_qp *qhp;
-       union t3_wr *wqe;
-       u32 pbl_addr;
-       u8 page_size;
-       u32 num_wrs;
-       unsigned long flag;
-       struct ib_sge sgl;
-       int err=0;
-       enum t3_wr_flags t3_wr_flags;
-       u32 idx;
-       struct t3_swsq *sqp;
-
-       qhp = to_iwch_qp(qp);
-       mhp = to_iwch_mw(mw);
-       rhp = qhp->rhp;
-
-       spin_lock_irqsave(&qhp->lock, flag);
-       if (qhp->attr.state > IWCH_QP_STATE_RTS) {
-               spin_unlock_irqrestore(&qhp->lock, flag);
-               return -EINVAL;
-       }
-       num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
-                           qhp->wq.sq_size_log2);
-       if (num_wrs == 0) {
-               spin_unlock_irqrestore(&qhp->lock, flag);
-               return -ENOMEM;
-       }
-       idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
-       PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __func__, idx,
-            mw, mw_bind);
-       wqe = (union t3_wr *) (qhp->wq.queue + idx);
-
-       t3_wr_flags = 0;
-       if (mw_bind->send_flags & IB_SEND_SIGNALED)
-               t3_wr_flags = T3_COMPLETION_FLAG;
-
-       sgl.addr = mw_bind->bind_info.addr;
-       sgl.lkey = mw_bind->bind_info.mr->lkey;
-       sgl.length = mw_bind->bind_info.length;
-       wqe->bind.reserved = 0;
-       wqe->bind.type = TPT_VATO;
-
-       /* TBD: check perms */
-       wqe->bind.perms = iwch_ib_to_tpt_bind_access(
-               mw_bind->bind_info.mw_access_flags);
-       wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey);
-       wqe->bind.mw_stag = cpu_to_be32(mw->rkey);
-       wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length);
-       wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr);
-       err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size);
-       if (err) {
-               spin_unlock_irqrestore(&qhp->lock, flag);
-               return err;
-       }
-       wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
-       sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
-       sqp->wr_id = mw_bind->wr_id;
-       sqp->opcode = T3_BIND_MW;
-       sqp->sq_wptr = qhp->wq.sq_wptr;
-       sqp->complete = 0;
-       sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED);
-       wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr);
-       wqe->bind.mr_pagesz = page_size;
-       build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags,
-                      Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0,
-                      sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP);
-       ++(qhp->wq.wptr);
-       ++(qhp->wq.sq_wptr);
-       spin_unlock_irqrestore(&qhp->lock, flag);
-
-       if (cxio_wq_db_enabled(&qhp->wq))
-               ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
-
-       return err;
-}
-
 static inline void build_term_codes(struct respQ_msg_t *rsp_msg,
                                    u8 *layer_type, u8 *ecode)
 {
index 326d07d823a5e62f0ee206b8c5f5865f7f97c08d..cd2ff5f9518a2b7143d53821672a5d98b9773b17 100644 (file)
@@ -3271,6 +3271,12 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
        struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
                                    &ep->com.mapped_local_addr;
 
+       if (ipv6_addr_type(&sin6->sin6_addr) != IPV6_ADDR_ANY) {
+               err = cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
+                                    (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+               if (err)
+                       return err;
+       }
        c4iw_init_wr_wait(&ep->com.wr_wait);
        err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0],
                                   ep->stid, &sin6->sin6_addr,
@@ -3282,13 +3288,13 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
                                          0, 0, __func__);
        else if (err > 0)
                err = net_xmit_errno(err);
-       if (err)
+       if (err) {
+               cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+                                  (const u32 *)&sin6->sin6_addr.s6_addr, 1);
                pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
                       err, ep->stid,
                       sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port));
-       else
-               cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
-                              (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+       }
        return err;
 }
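
For IPv6 listeners the CLIP (Compressed Local IP) table handling is reordered: the reference on the local address is taken before cxgb4_create_server6() (the wildcard address needs no entry) and dropped again if server creation fails, rather than being taken only on success. That keeps the CLIP entry held for the whole lifetime of the listening endpoint. The shape of the pattern, sketched with a hypothetical start_server6() standing in for the cxgb4 call; unlike the hunk above, the sketch guards the release with the same wildcard test for symmetry:

	static int listen6_with_clip(struct net_device *port,
				     struct sockaddr_in6 *sin6)
	{
		bool have_clip = ipv6_addr_type(&sin6->sin6_addr) != IPV6_ADDR_ANY;
		int err;

		if (have_clip) {
			/* pin the local IPv6 address in the adapter's CLIP table */
			err = cxgb4_clip_get(port,
					     (const u32 *)&sin6->sin6_addr.s6_addr, 1);
			if (err)
				return err;
		}

		err = start_server6(port, sin6);
		if (err && have_clip)
			cxgb4_clip_release(port,
					   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
		return err;
	}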
 
index de9cd6901752fc1e3da38d64f62bfce7cb853501..cf21df4a8bf5b80da00f6f82003980650df9c522 100644 (file)
@@ -744,9 +744,6 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
                case FW_RI_SEND_WITH_SE:
                        wc->opcode = IB_WC_SEND;
                        break;
-               case FW_RI_BIND_MW:
-                       wc->opcode = IB_WC_BIND_MW;
-                       break;
 
                case FW_RI_LOCAL_INV:
                        wc->opcode = IB_WC_LOCAL_INV;
index 58fce1742b8d8c33b91dfa89075617a9c5cf34f7..8024ea4417b8735b93e77dc9310314db44b8833e 100644 (file)
@@ -315,14 +315,12 @@ static int qp_release(struct inode *inode, struct file *file)
 static int qp_open(struct inode *inode, struct file *file)
 {
        struct c4iw_debugfs_data *qpd;
-       int ret = 0;
        int count = 1;
 
        qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
-       if (!qpd) {
-               ret = -ENOMEM;
-               goto out;
-       }
+       if (!qpd)
+               return -ENOMEM;
+
        qpd->devp = inode->i_private;
        qpd->pos = 0;
 
@@ -333,8 +331,8 @@ static int qp_open(struct inode *inode, struct file *file)
        qpd->bufsize = count * 128;
        qpd->buf = vmalloc(qpd->bufsize);
        if (!qpd->buf) {
-               ret = -ENOMEM;
-               goto err1;
+               kfree(qpd);
+               return -ENOMEM;
        }
 
        spin_lock_irq(&qpd->devp->lock);
@@ -343,11 +341,7 @@ static int qp_open(struct inode *inode, struct file *file)
 
        qpd->buf[qpd->pos++] = 0;
        file->private_data = qpd;
-       goto out;
-err1:
-       kfree(qpd);
-out:
-       return ret;
+       return 0;
 }
 
 static const struct file_operations qp_debugfs_fops = {
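
qp_open() illustrates a broader cleanup in this file: single-use goto labels are replaced by direct returns, with each exit freeing exactly what the function has allocated so far. Reduced to its skeleton (the sketch function name is illustrative; the struct is the one used above):

	static int open_with_two_allocs(struct inode *inode, struct file *file)
	{
		struct c4iw_debugfs_data *d = kmalloc(sizeof(*d), GFP_KERNEL);

		if (!d)
			return -ENOMEM;

		d->buf = vmalloc(PAGE_SIZE);
		if (!d->buf) {
			kfree(d);	/* undo the one prior allocation */
			return -ENOMEM;
		}

		file->private_data = d;
		return 0;
	}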
@@ -781,8 +775,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
                pr_err(MOD "%s: unsupported udb/ucq densities %u/%u\n",
                       pci_name(rdev->lldi.pdev), rdev->lldi.udb_density,
                       rdev->lldi.ucq_density);
-               err = -EINVAL;
-               goto err1;
+               return -EINVAL;
        }
        if (rdev->lldi.vr->qp.start != rdev->lldi.vr->cq.start ||
            rdev->lldi.vr->qp.size != rdev->lldi.vr->cq.size) {
@@ -791,8 +784,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
                       pci_name(rdev->lldi.pdev), rdev->lldi.vr->qp.start,
                       rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.size,
                       rdev->lldi.vr->cq.size);
-               err = -EINVAL;
-               goto err1;
+               return -EINVAL;
        }
 
        rdev->qpmask = rdev->lldi.udb_density - 1;
@@ -816,10 +808,8 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
             rdev->lldi.db_reg, rdev->lldi.gts_reg,
             rdev->qpmask, rdev->cqmask);
 
-       if (c4iw_num_stags(rdev) == 0) {
-               err = -EINVAL;
-               goto err1;
-       }
+       if (c4iw_num_stags(rdev) == 0)
+               return -EINVAL;
 
        rdev->stats.pd.total = T4_MAX_NUM_PD;
        rdev->stats.stag.total = rdev->lldi.vr->stag.size;
@@ -831,29 +821,31 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
        err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD);
        if (err) {
                printk(KERN_ERR MOD "error %d initializing resources\n", err);
-               goto err1;
+               return err;
        }
        err = c4iw_pblpool_create(rdev);
        if (err) {
                printk(KERN_ERR MOD "error %d initializing pbl pool\n", err);
-               goto err2;
+               goto destroy_resource;
        }
        err = c4iw_rqtpool_create(rdev);
        if (err) {
                printk(KERN_ERR MOD "error %d initializing rqt pool\n", err);
-               goto err3;
+               goto destroy_pblpool;
        }
        err = c4iw_ocqp_pool_create(rdev);
        if (err) {
                printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);
-               goto err4;
+               goto destroy_rqtpool;
        }
        rdev->status_page = (struct t4_dev_status_page *)
                            __get_free_page(GFP_KERNEL);
-       if (!rdev->status_page) {
-               pr_err(MOD "error allocating status page\n");
-               goto err4;
-       }
+       if (!rdev->status_page)
+               goto destroy_ocqp_pool;
+       rdev->status_page->qp_start = rdev->lldi.vr->qp.start;
+       rdev->status_page->qp_size = rdev->lldi.vr->qp.size;
+       rdev->status_page->cq_start = rdev->lldi.vr->cq.start;
+       rdev->status_page->cq_size = rdev->lldi.vr->cq.size;
 
        if (c4iw_wr_log) {
                rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) *
@@ -869,13 +861,14 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev)
        rdev->status_page->db_off = 0;
 
        return 0;
-err4:
+destroy_ocqp_pool:
+       c4iw_ocqp_pool_destroy(rdev);
+destroy_rqtpool:
        c4iw_rqtpool_destroy(rdev);
-err3:
+destroy_pblpool:
        c4iw_pblpool_destroy(rdev);
-err2:
+destroy_resource:
        c4iw_destroy_resource(&rdev->resource);
-err1:
        return err;
 }
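
c4iw_rdev_open() swaps the numbered err1..err4 labels for labels named after the undo action they perform, and the early sanity checks return directly since nothing is held yet. The rename also fixes a real leak: the old err4 path skipped destroying the OCQP pool when the status-page allocation failed. Note that, as committed, that failure path still jumps with err == 0; a robust version sets an errno first, as in this sketch (function name illustrative, all callees from the hunks above):

	static int rdev_open_sketch(struct c4iw_rdev *rdev)
	{
		int err;

		err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD);
		if (err)
			return err;
		err = c4iw_pblpool_create(rdev);
		if (err)
			goto destroy_resource;
		err = c4iw_rqtpool_create(rdev);
		if (err)
			goto destroy_pblpool;
		err = c4iw_ocqp_pool_create(rdev);
		if (err)
			goto destroy_rqtpool;
		rdev->status_page = (struct t4_dev_status_page *)
				    __get_free_page(GFP_KERNEL);
		if (!rdev->status_page) {
			err = -ENOMEM;	/* not set in the hunk above */
			goto destroy_ocqp_pool;
		}
		return 0;

	destroy_ocqp_pool:
		c4iw_ocqp_pool_destroy(rdev);
	destroy_rqtpool:
		c4iw_rqtpool_destroy(rdev);
	destroy_pblpool:
		c4iw_pblpool_destroy(rdev);
	destroy_resource:
		c4iw_destroy_resource(&rdev->resource);
		return err;
	}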
 
index 00e55faa086aa2a30259c5c8b96cc0ae129a4ef6..fb2de75a039216e2f732a8f61ba387de121f1ce7 100644 (file)
@@ -947,8 +947,6 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                      struct ib_send_wr **bad_wr);
 int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
                      struct ib_recv_wr **bad_wr);
-int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
-                struct ib_mw_bind *mw_bind);
 int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
 int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog);
 int c4iw_destroy_listen(struct iw_cm_id *cm_id);
@@ -968,17 +966,6 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
                                           u64 length, u64 virt, int acc,
                                           struct ib_udata *udata);
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
-struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
-                                       struct ib_phys_buf *buffer_list,
-                                       int num_phys_buf,
-                                       int acc,
-                                       u64 *iova_start);
-int c4iw_reregister_phys_mem(struct ib_mr *mr,
-                                    int mr_rereg_mask,
-                                    struct ib_pd *pd,
-                                    struct ib_phys_buf *buffer_list,
-                                    int num_phys_buf,
-                                    int acc, u64 *iova_start);
 int c4iw_dereg_mr(struct ib_mr *ib_mr);
 int c4iw_destroy_cq(struct ib_cq *ib_cq);
 struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
index e1629ab58db7873a3d9a6c044ba7bc65eb4512c6..7849890c478141e57136127c84f4cad9fd46227d 100644 (file)
@@ -392,32 +392,6 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
        return ret;
 }
 
-static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
-                         struct c4iw_mr *mhp, int shift, int npages)
-{
-       u32 stag;
-       int ret;
-
-       if (npages > mhp->attr.pbl_size)
-               return -ENOMEM;
-
-       stag = mhp->attr.stag;
-       ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
-                             FW_RI_STAG_NSMR, mhp->attr.perms,
-                             mhp->attr.mw_bind_enable, mhp->attr.zbva,
-                             mhp->attr.va_fbo, mhp->attr.len, shift - 12,
-                             mhp->attr.pbl_size, mhp->attr.pbl_addr);
-       if (ret)
-               return ret;
-
-       ret = finish_mem_reg(mhp, stag);
-       if (ret)
-               dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-                      mhp->attr.pbl_addr);
-
-       return ret;
-}
-
 static int alloc_pbl(struct c4iw_mr *mhp, int npages)
 {
        mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev,
@@ -431,228 +405,6 @@ static int alloc_pbl(struct c4iw_mr *mhp, int npages)
        return 0;
 }
 
-static int build_phys_page_list(struct ib_phys_buf *buffer_list,
-                               int num_phys_buf, u64 *iova_start,
-                               u64 *total_size, int *npages,
-                               int *shift, __be64 **page_list)
-{
-       u64 mask;
-       int i, j, n;
-
-       mask = 0;
-       *total_size = 0;
-       for (i = 0; i < num_phys_buf; ++i) {
-               if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
-                       return -EINVAL;
-               if (i != 0 && i != num_phys_buf - 1 &&
-                   (buffer_list[i].size & ~PAGE_MASK))
-                       return -EINVAL;
-               *total_size += buffer_list[i].size;
-               if (i > 0)
-                       mask |= buffer_list[i].addr;
-               else
-                       mask |= buffer_list[i].addr & PAGE_MASK;
-               if (i != num_phys_buf - 1)
-                       mask |= buffer_list[i].addr + buffer_list[i].size;
-               else
-                       mask |= (buffer_list[i].addr + buffer_list[i].size +
-                               PAGE_SIZE - 1) & PAGE_MASK;
-       }
-
-       if (*total_size > 0xFFFFFFFFULL)
-               return -ENOMEM;
-
-       /* Find largest page shift we can use to cover buffers */
-       for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
-               if ((1ULL << *shift) & mask)
-                       break;
-
-       buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
-       buffer_list[0].addr &= ~0ull << *shift;
-
-       *npages = 0;
-       for (i = 0; i < num_phys_buf; ++i)
-               *npages += (buffer_list[i].size +
-                       (1ULL << *shift) - 1) >> *shift;
-
-       if (!*npages)
-               return -EINVAL;
-
-       *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
-       if (!*page_list)
-               return -ENOMEM;
-
-       n = 0;
-       for (i = 0; i < num_phys_buf; ++i)
-               for (j = 0;
-                    j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
-                    ++j)
-                       (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
-                           ((u64) j << *shift));
-
-       PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
-            __func__, (unsigned long long)*iova_start,
-            (unsigned long long)mask, *shift, (unsigned long long)*total_size,
-            *npages);
-
-       return 0;
-
-}
-
-int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
-                            struct ib_pd *pd, struct ib_phys_buf *buffer_list,
-                            int num_phys_buf, int acc, u64 *iova_start)
-{
-
-       struct c4iw_mr mh, *mhp;
-       struct c4iw_pd *php;
-       struct c4iw_dev *rhp;
-       __be64 *page_list = NULL;
-       int shift = 0;
-       u64 total_size;
-       int npages;
-       int ret;
-
-       PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
-
-       /* There can be no memory windows */
-       if (atomic_read(&mr->usecnt))
-               return -EINVAL;
-
-       mhp = to_c4iw_mr(mr);
-       rhp = mhp->rhp;
-       php = to_c4iw_pd(mr->pd);
-
-       /* make sure we are on the same adapter */
-       if (rhp != php->rhp)
-               return -EINVAL;
-
-       memcpy(&mh, mhp, sizeof *mhp);
-
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               php = to_c4iw_pd(pd);
-       if (mr_rereg_mask & IB_MR_REREG_ACCESS) {
-               mh.attr.perms = c4iw_ib_to_tpt_access(acc);
-               mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) ==
-                                        IB_ACCESS_MW_BIND;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
-               ret = build_phys_page_list(buffer_list, num_phys_buf,
-                                               iova_start,
-                                               &total_size, &npages,
-                                               &shift, &page_list);
-               if (ret)
-                       return ret;
-       }
-
-       if (mr_exceeds_hw_limits(rhp, total_size)) {
-               kfree(page_list);
-               return -EINVAL;
-       }
-
-       ret = reregister_mem(rhp, php, &mh, shift, npages);
-       kfree(page_list);
-       if (ret)
-               return ret;
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               mhp->attr.pdid = php->pdid;
-       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
-               mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
-               mhp->attr.zbva = 0;
-               mhp->attr.va_fbo = *iova_start;
-               mhp->attr.page_size = shift - 12;
-               mhp->attr.len = (u32) total_size;
-               mhp->attr.pbl_size = npages;
-       }
-
-       return 0;
-}
-
-struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
-                                    struct ib_phys_buf *buffer_list,
-                                    int num_phys_buf, int acc, u64 *iova_start)
-{
-       __be64 *page_list;
-       int shift;
-       u64 total_size;
-       int npages;
-       struct c4iw_dev *rhp;
-       struct c4iw_pd *php;
-       struct c4iw_mr *mhp;
-       int ret;
-
-       PDBG("%s ib_pd %p\n", __func__, pd);
-       php = to_c4iw_pd(pd);
-       rhp = php->rhp;
-
-       mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
-       if (!mhp)
-               return ERR_PTR(-ENOMEM);
-
-       mhp->rhp = rhp;
-
-       /* First check that we have enough alignment */
-       if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
-               ret = -EINVAL;
-               goto err;
-       }
-
-       if (num_phys_buf > 1 &&
-           ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
-               ret = -EINVAL;
-               goto err;
-       }
-
-       ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
-                                       &total_size, &npages, &shift,
-                                       &page_list);
-       if (ret)
-               goto err;
-
-       if (mr_exceeds_hw_limits(rhp, total_size)) {
-               kfree(page_list);
-               ret = -EINVAL;
-               goto err;
-       }
-
-       ret = alloc_pbl(mhp, npages);
-       if (ret) {
-               kfree(page_list);
-               goto err;
-       }
-
-       ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr,
-                            npages);
-       kfree(page_list);
-       if (ret)
-               goto err_pbl;
-
-       mhp->attr.pdid = php->pdid;
-       mhp->attr.zbva = 0;
-
-       mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
-       mhp->attr.va_fbo = *iova_start;
-       mhp->attr.page_size = shift - 12;
-
-       mhp->attr.len = (u32) total_size;
-       mhp->attr.pbl_size = npages;
-       ret = register_mem(rhp, php, mhp, shift);
-       if (ret)
-               goto err_pbl;
-
-       return &mhp->ibmr;
-
-err_pbl:
-       c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
-                             mhp->attr.pbl_size << 3);
-
-err:
-       kfree(mhp);
-       return ERR_PTR(ret);
-
-}
-
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
 {
        struct c4iw_dev *rhp;
@@ -952,9 +704,6 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr)
        u32 mmid;
 
        PDBG("%s ib_mr %p\n", __func__, ib_mr);
-       /* There can be no memory windows */
-       if (atomic_read(&ib_mr->usecnt))
-               return -EINVAL;
 
        mhp = to_c4iw_mr(ib_mr);
        rhp = mhp->rhp;
index 0a7d99818b17d13384e32cb81446bd250647f85f..ec04272fbdc2ffbf882318678332a506c1f1d4ec 100644 (file)
@@ -549,12 +549,9 @@ int c4iw_register_device(struct c4iw_dev *dev)
        dev->ibdev.resize_cq = c4iw_resize_cq;
        dev->ibdev.poll_cq = c4iw_poll_cq;
        dev->ibdev.get_dma_mr = c4iw_get_dma_mr;
-       dev->ibdev.reg_phys_mr = c4iw_register_phys_mem;
-       dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem;
        dev->ibdev.reg_user_mr = c4iw_reg_user_mr;
        dev->ibdev.dereg_mr = c4iw_dereg_mr;
        dev->ibdev.alloc_mw = c4iw_alloc_mw;
-       dev->ibdev.bind_mw = c4iw_bind_mw;
        dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
        dev->ibdev.alloc_mr = c4iw_alloc_mr;
        dev->ibdev.map_mr_sg = c4iw_map_mr_sg;
index aa515afee7248823428cb2b725bf10c70f4fd82a..e99345eb875aa286a8b962696eba9742189e660c 100644 (file)
@@ -933,11 +933,6 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
        return err;
 }
 
-int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind)
-{
-       return -ENOSYS;
-}
-
 static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type,
                                    u8 *ecode)
 {
index 1092a2d1f607464152fbd2d7e836e180d2131312..6126bbe36095c21021f9cc5cdca3b75da299dc85 100644 (file)
@@ -699,4 +699,11 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq)
 
 struct t4_dev_status_page {
        u8 db_off;
+       u8 pad1;
+       u16 pad2;
+       u32 pad3;
+       u64 qp_start;
+       u64 qp_size;
+       u64 cq_start;
+       u64 cq_size;
 };
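
The status page shared with userspace grows four 64-bit fields describing the QP and CQ ranges (filled in by c4iw_rdev_open() above), with explicit pad fields so db_off keeps offset 0 and the new fields start 8-byte aligned on every ABI. Compile-time checks of the layout this padding implies (the BUILD_BUG_ON lines are illustrative, not part of the patch):

	static inline void t4_status_page_layout_check(void)
	{
		BUILD_BUG_ON(offsetof(struct t4_dev_status_page, db_off) != 0);
		BUILD_BUG_ON(offsetof(struct t4_dev_status_page, qp_start) != 8);
		BUILD_BUG_ON(sizeof(struct t4_dev_status_page) != 40);
	}

Because userspace maps this page, extending it is an ABI change, which is why the next hunk bumps C4IW_UVERBS_ABI_VERSION from 2 to 3.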
index cbd0ce1707282a63fe1acb10c223c4af22e28ad1..295f422b9a3ab4caa829182712dcab6bc19a504d 100644 (file)
@@ -32,7 +32,7 @@
 #ifndef __C4IW_USER_H__
 #define __C4IW_USER_H__
 
-#define C4IW_UVERBS_ABI_VERSION        2
+#define C4IW_UVERBS_ABI_VERSION        3
 
 /*
  * Make sure that all structs defined in this file remain laid out so
index 86af71351d9a5be6cbdd87f3d22ccb7078c82b23..105246fba2e7c381958c2e5e74b3a4af7b1441a8 100644 (file)
@@ -92,7 +92,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
                                ah_attr->grh.sgid_index, &sgid, &gid_attr);
        if (ret)
                return ERR_PTR(ret);
-       memset(ah->av.eth.s_mac, 0, ETH_ALEN);
+       eth_zero_addr(ah->av.eth.s_mac);
        if (gid_attr.ndev) {
                if (is_vlan_dev(gid_attr.ndev))
                        vlan_tag = vlan_dev_vlan_id(gid_attr.ndev);
@@ -104,6 +104,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
        ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
        ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index);
        ah->av.eth.vlan = cpu_to_be16(vlan_tag);
+       ah->av.eth.hop_limit = ah_attr->grh.hop_limit;
        if (ah_attr->static_rate) {
                ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
                while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
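
Here the hop limit resolved by the core (see the ib_resolve_eth_dmac() change at the top of this section) finally reaches hardware: it is copied from ah_attr into the mlx4 Ethernet address vector. The eth_zero_addr() switch is cosmetic; the helper from <linux/etherdevice.h> is just the dedicated spelling of the replaced memset:

	/* Equivalent of the replaced line, for reference. */
	static inline void zero_mac(u8 *addr)
	{
		memset(addr, 0x00, ETH_ALEN);
	}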
index b88fc8f5ab180c717e827f582187b817f92a0e33..9f8b516eb2b0f2412452685d21df360075245f0c 100644 (file)
@@ -811,9 +811,6 @@ repoll:
                        wc->opcode    = IB_WC_MASKED_FETCH_ADD;
                        wc->byte_len  = 8;
                        break;
-               case MLX4_OPCODE_BIND_MW:
-                       wc->opcode    = IB_WC_BIND_MW;
-                       break;
                case MLX4_OPCODE_LSO:
                        wc->opcode    = IB_WC_LSO;
                        break;
index 97d6878f993828085a0f26f941572674ce68b967..1c7ab6cabbb86989fba8952b8f5e6be645e313dc 100644 (file)
@@ -154,9 +154,9 @@ static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_n
        return dev;
 }
 
-static int mlx4_ib_update_gids(struct gid_entry *gids,
-                              struct mlx4_ib_dev *ibdev,
-                              u8 port_num)
+static int mlx4_ib_update_gids_v1(struct gid_entry *gids,
+                                 struct mlx4_ib_dev *ibdev,
+                                 u8 port_num)
 {
        struct mlx4_cmd_mailbox *mailbox;
        int err;
@@ -187,6 +187,63 @@ static int mlx4_ib_update_gids(struct gid_entry *gids,
        return err;
 }
 
+static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids,
+                                    struct mlx4_ib_dev *ibdev,
+                                    u8 port_num)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       int err;
+       struct mlx4_dev *dev = ibdev->dev;
+       int i;
+       struct {
+               union ib_gid    gid;
+               __be32          rsrvd1[2];
+               __be16          rsrvd2;
+               u8              type;
+               u8              version;
+               __be32          rsrvd3;
+       } *gid_tbl;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return -ENOMEM;
+
+       gid_tbl = mailbox->buf;
+       for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+               memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid));
+               if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
+                       gid_tbl[i].version = 2;
+                       if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid))
+                               gid_tbl[i].type = 1;
+                       else
+                               memset(&gid_tbl[i].gid, 0, 12);
+               }
+       }
+
+       err = mlx4_cmd(dev, mailbox->dma,
+                      MLX4_SET_PORT_ROCE_ADDR << 8 | port_num,
+                      1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+                      MLX4_CMD_WRAPPED);
+       if (mlx4_is_bonded(dev))
+               err += mlx4_cmd(dev, mailbox->dma,
+                               MLX4_SET_PORT_ROCE_ADDR << 8 | 2,
+                               1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+                               MLX4_CMD_WRAPPED);
+
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+
+static int mlx4_ib_update_gids(struct gid_entry *gids,
+                              struct mlx4_ib_dev *ibdev,
+                              u8 port_num)
+{
+       if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+               return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num);
+
+       return mlx4_ib_update_gids_v1(gids, ibdev, port_num);
+}
+
 static int mlx4_ib_add_gid(struct ib_device *device,
                           u8 port_num,
                           unsigned int index,
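
mlx4_ib_update_gids_v1_v2() writes the extended firmware GID table format: version 2 marks a RoCE v2 (UDP-encapsulated) entry, type distinguishes native IPv6 GIDs from IPv4-mapped ones, and for the IPv4-mapped case the upper 12 bytes are cleared so firmware sees the bare IPv4 address. The dispatcher then picks the layout from the ROCE_V1_V2 capability flag. The per-entry rule, sketched with a named struct standing in for the anonymous one above (reserved fields omitted):

	struct roce_gid_entry {			/* illustrative name */
		union ib_gid	gid;
		u8		type;		/* 1 = IPv6, 0 = IPv4-mapped */
		u8		version;	/* 2 = RoCE v2 */
	};

	static void fill_gid_entry(struct roce_gid_entry *e,
				   const union ib_gid *gid,
				   enum ib_gid_type gid_type)
	{
		memcpy(&e->gid, gid, sizeof(*gid));
		e->type = 0;
		e->version = 0;
		if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
			e->version = 2;
			if (!ipv6_addr_v4mapped((struct in6_addr *)&e->gid))
				e->type = 1;
			else
				memset(&e->gid, 0, 12);	/* keep only the IPv4 part */
		}
	}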
@@ -215,7 +272,8 @@ static int mlx4_ib_add_gid(struct ib_device *device,
        port_gid_table = &iboe->gids[port_num - 1];
        spin_lock_bh(&iboe->lock);
        for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
-               if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
+               if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) &&
+                   (port_gid_table->gids[i].gid_type == attr->gid_type))  {
                        found = i;
                        break;
                }
@@ -233,6 +291,7 @@ static int mlx4_ib_add_gid(struct ib_device *device,
                        } else {
                                *context = port_gid_table->gids[free].ctx;
                                memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid));
+                               port_gid_table->gids[free].gid_type = attr->gid_type;
                                port_gid_table->gids[free].ctx->real_index = free;
                                port_gid_table->gids[free].ctx->refcount = 1;
                                hw_update = 1;
@@ -248,8 +307,10 @@ static int mlx4_ib_add_gid(struct ib_device *device,
                if (!gids) {
                        ret = -ENOMEM;
                } else {
-                       for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+                       for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) {
                                memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+                               gids[i].gid_type = port_gid_table->gids[i].gid_type;
+                       }
                }
        }
        spin_unlock_bh(&iboe->lock);
@@ -325,6 +386,7 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
        int i;
        int ret;
        unsigned long flags;
+       struct ib_gid_attr attr;
 
        if (port_num > MLX4_MAX_PORTS)
                return -EINVAL;
@@ -335,10 +397,13 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
        if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num))
                return index;
 
-       ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, NULL);
+       ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr);
        if (ret)
                return ret;
 
+       if (attr.ndev)
+               dev_put(attr.ndev);
+
        if (!memcmp(&gid, &zgid, sizeof(gid)))
                return -EINVAL;
 
@@ -346,7 +411,8 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
        port_gid_table = &iboe->gids[port_num - 1];
 
        for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
-               if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) {
+               if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) &&
+                   attr.gid_type == port_gid_table->gids[i].gid_type) {
                        ctx = port_gid_table->gids[i].ctx;
                        break;
                }
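
Two related fixes in the GID cache users: since RoCE v1 and v2 may publish the same GID value in different slots, lookups now match on the (gid, gid_type) pair rather than the value alone; and because ib_get_cached_gid() takes a reference on attr.ndev whenever an attribute struct is passed, the caller must dev_put() it even though only gid_type is consumed. Sketched as a small wrapper (name illustrative):

	static int cached_gid_type(struct ib_device *ibdev, u8 port_num, int index,
				   union ib_gid *gid, enum ib_gid_type *gid_type)
	{
		struct ib_gid_attr attr;
		int ret;

		ret = ib_get_cached_gid(ibdev, port_num, index, gid, &attr);
		if (ret)
			return ret;
		if (attr.ndev)
			dev_put(attr.ndev);	/* reference held by the cache call */
		*gid_type = attr.gid_type;
		return 0;
	}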
@@ -2119,6 +2185,7 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
                               struct ib_port_immutable *immutable)
 {
        struct ib_port_attr attr;
+       struct mlx4_ib_dev *mdev = to_mdev(ibdev);
        int err;
 
        err = mlx4_ib_query_port(ibdev, port_num, &attr);
@@ -2128,10 +2195,15 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
 
-       if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND)
+       if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) {
                immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
-       else
-               immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+       } else {
+               if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)
+                       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+               if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+                       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE |
+                               RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+       }
 
        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
 
@@ -2283,7 +2355,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
            dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
                ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
-               ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw;
                ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
 
                ibdev->ib_dev.uverbs_cmd_mask |=
@@ -2423,7 +2494,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        if (mlx4_ib_init_sriov(ibdev))
                goto err_mad;
 
-       if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) {
+       if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE ||
+           dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
                if (!iboe->nb.notifier_call) {
                        iboe->nb.notifier_call = mlx4_ib_netdev_event;
                        err = register_netdevice_notifier(&iboe->nb);
@@ -2432,6 +2504,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
                                goto err_notif;
                        }
                }
+               if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+                       err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT);
+                       if (err) {
+                               goto err_notif;
+                       }
+               }
        }
 
        for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
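
Two init-path consequences of RoCE v2 support: the netdev notifier is registered whenever either RoCE generation is available, and firmware is told which UDP destination port to classify as RoCE v2 traffic (ROCE_V2_UDP_DPORT, the IANA-assigned 4791). On the wire a v2 packet is an ordinary UDP datagram selected by that port, as this hypothetical classifier illustrates:

	#include <linux/udp.h>

	/* Illustrative only; the real classification happens in firmware. */
	static bool is_roce_v2_packet(const struct udphdr *uh)
	{
		return ntohs(uh->dest) == 4791;	/* ROCE_V2_UDP_DPORT */
	}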
index 1caa11edac03347a246467436daa72dd18d5c505..52ce7b000044f6e0814b59b08e1ea27a696e77d8 100644 (file)
@@ -177,11 +177,18 @@ struct mlx4_ib_wq {
        unsigned                tail;
 };
 
+enum {
+       MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START
+};
+
 enum mlx4_ib_qp_flags {
        MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
        MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
        MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
        MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO,
+
+       /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */
+       MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI,
        MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
        MLX4_IB_SRIOV_SQP = 1 << 31,
 };
@@ -478,6 +485,7 @@ struct gid_cache_context {
 
 struct gid_entry {
        union ib_gid    gid;
+       enum ib_gid_type gid_type;
        struct gid_cache_context *ctx;
 };
 
@@ -704,8 +712,6 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                  struct ib_udata *udata);
 int mlx4_ib_dereg_mr(struct ib_mr *mr);
 struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
-                   struct ib_mw_bind *mw_bind);
 int mlx4_ib_dealloc_mw(struct ib_mw *mw);
 struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
                               enum ib_mr_type mr_type,
index 4d1e1c632603a7b81e6685cd5ad6020f98a02a47..242b94ec105babe092ba3fee42c931ee24518adc 100644 (file)
@@ -366,28 +366,6 @@ err_free:
        return ERR_PTR(err);
 }
 
-int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
-                   struct ib_mw_bind *mw_bind)
-{
-       struct ib_bind_mw_wr  wr;
-       struct ib_send_wr *bad_wr;
-       int ret;
-
-       memset(&wr, 0, sizeof(wr));
-       wr.wr.opcode            = IB_WR_BIND_MW;
-       wr.wr.wr_id             = mw_bind->wr_id;
-       wr.wr.send_flags        = mw_bind->send_flags;
-       wr.mw                   = mw;
-       wr.bind_info            = mw_bind->bind_info;
-       wr.rkey                 = ib_inc_rkey(mw->rkey);
-
-       ret = mlx4_ib_post_send(qp, &wr.wr, &bad_wr);
-       if (!ret)
-               mw->rkey = wr.rkey;
-
-       return ret;
-}
-
 int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
 {
        struct mlx4_ib_mw *mw = to_mmw(ibmw);
index 13eaaf45288f80d4bb6658d5a18985a978e8c98d..bc5536f00b6cd4cc148f65b2c6131d7e627544eb 100644 (file)
@@ -32,6 +32,8 @@
  */
 
 #include <linux/log2.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
@@ -85,6 +87,7 @@ struct mlx4_ib_sqp {
        u32                     send_psn;
        struct ib_ud_header     ud_header;
        u8                      header_buf[MLX4_IB_UD_HEADER_SIZE];
+       struct ib_qp            *roce_v2_gsi;
 };
 
 enum {
@@ -115,7 +118,6 @@ static const __be32 mlx4_ib_opcode[] = {
        [IB_WR_REG_MR]                          = cpu_to_be32(MLX4_OPCODE_FMR),
        [IB_WR_MASKED_ATOMIC_CMP_AND_SWP]       = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
        [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]     = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
-       [IB_WR_BIND_MW]                         = cpu_to_be32(MLX4_OPCODE_BIND_MW),
 };
 
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
@@ -154,7 +156,10 @@ static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
                        }
                }
        }
-       return proxy_sqp;
+       if (proxy_sqp)
+               return 1;
+
+       return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP);
 }
 
 /* used for INIT/CLOSE port logic */
@@ -796,11 +801,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                if (err)
                        goto err_mtt;
 
-               qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp);
+               qp->sq.wrid = kmalloc_array(qp->sq.wqe_cnt, sizeof(u64),
+                                       gfp | __GFP_NOWARN);
                if (!qp->sq.wrid)
                        qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64),
                                                gfp, PAGE_KERNEL);
-               qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp);
+               qp->rq.wrid = kmalloc_array(qp->rq.wqe_cnt, sizeof(u64),
+                                       gfp | __GFP_NOWARN);
                if (!qp->rq.wrid)
                        qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64),
                                                gfp, PAGE_KERNEL);
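
The wrid arrays move to kmalloc_array(), which checks the count * size multiplication for overflow, and gain __GFP_NOWARN: a failure here is expected and handled, because the code immediately retries with __vmalloc(), so the allocation-failure splat would be pure noise. The idiom in isolation (helper name illustrative; kvfree() releases memory from either allocator):

	static u64 *alloc_wrid(size_t n, gfp_t gfp)
	{
		u64 *p = kmalloc_array(n, sizeof(*p), gfp | __GFP_NOWARN);

		if (!p)		/* no contiguous run available: fall back */
			p = __vmalloc(n * sizeof(*p), gfp, PAGE_KERNEL);
		return p;
	}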
@@ -1099,9 +1106,9 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
                return dev->dev->caps.qp1_proxy[attr->port_num - 1];
 }
 
-struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
-                               struct ib_qp_init_attr *init_attr,
-                               struct ib_udata *udata)
+static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
+                                       struct ib_qp_init_attr *init_attr,
+                                       struct ib_udata *udata)
 {
        struct mlx4_ib_qp *qp = NULL;
        int err;
@@ -1120,6 +1127,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
                                        MLX4_IB_SRIOV_TUNNEL_QP |
                                        MLX4_IB_SRIOV_SQP |
                                        MLX4_IB_QP_NETIF |
+                                       MLX4_IB_QP_CREATE_ROCE_V2_GSI |
                                        MLX4_IB_QP_CREATE_USE_GFP_NOIO))
                return ERR_PTR(-EINVAL);
 
@@ -1128,15 +1136,21 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
                        return ERR_PTR(-EINVAL);
        }
 
-       if (init_attr->create_flags &&
-           ((udata && init_attr->create_flags & ~(sup_u_create_flags)) ||
-            ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
-                                          MLX4_IB_QP_CREATE_USE_GFP_NOIO |
-                                          MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)) &&
-             init_attr->qp_type != IB_QPT_UD) ||
-            ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&
-             init_attr->qp_type > IB_QPT_GSI)))
-               return ERR_PTR(-EINVAL);
+       if (init_attr->create_flags) {
+               if (udata && init_attr->create_flags & ~(sup_u_create_flags))
+                       return ERR_PTR(-EINVAL);
+
+               if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
+                                                MLX4_IB_QP_CREATE_USE_GFP_NOIO |
+                                                MLX4_IB_QP_CREATE_ROCE_V2_GSI  |
+                                                MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) &&
+                    init_attr->qp_type != IB_QPT_UD) ||
+                   (init_attr->create_flags & MLX4_IB_SRIOV_SQP &&
+                    init_attr->qp_type > IB_QPT_GSI) ||
+                   (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
+                    init_attr->qp_type != IB_QPT_GSI))
+                       return ERR_PTR(-EINVAL);
+       }
 
        switch (init_attr->qp_type) {
        case IB_QPT_XRC_TGT:
@@ -1173,19 +1187,29 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
        case IB_QPT_SMI:
        case IB_QPT_GSI:
        {
+               int sqpn;
+
                /* Userspace is not allowed to create special QPs: */
                if (udata)
                        return ERR_PTR(-EINVAL);
+               if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
+                       int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0);
+
+                       if (res)
+                               return ERR_PTR(res);
+               } else {
+                       sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
+               }
 
                err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
-                                      get_sqp_num(to_mdev(pd->device), init_attr),
+                                      sqpn,
                                       &qp, gfp);
                if (err)
                        return ERR_PTR(err);
 
                qp->port        = init_attr->port_num;
-               qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
-
+               qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
+                       init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1;
                break;
        }
        default:
@@ -1196,7 +1220,41 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
        return &qp->ibqp;
 }
 
-int mlx4_ib_destroy_qp(struct ib_qp *qp)
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+                               struct ib_qp_init_attr *init_attr,
+                               struct ib_udata *udata) {
+       struct ib_device *device = pd ? pd->device : init_attr->xrcd->device;
+       struct ib_qp *ibqp;
+       struct mlx4_ib_dev *dev = to_mdev(device);
+
+       ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);
+
+       if (!IS_ERR(ibqp) &&
+           (init_attr->qp_type == IB_QPT_GSI) &&
+           !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
+               struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp)));
+               int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num);
+
+               if (is_eth &&
+                   dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+                       init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+                       sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
+
+                       if (IS_ERR(sqp->roce_v2_gsi)) {
+                               pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
+                               sqp->roce_v2_gsi = NULL;
+                       } else {
+                               sqp = to_msqp(to_mqp(sqp->roce_v2_gsi));
+                               sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP;
+                       }
+
+                       init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+               }
+       }
+       return ibqp;
+}
+
+static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
 {
        struct mlx4_ib_dev *dev = to_mdev(qp->device);
        struct mlx4_ib_qp *mqp = to_mqp(qp);
@@ -1225,6 +1283,20 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)
        return 0;
 }
 
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+       struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+       if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+               struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+
+               if (sqp->roce_v2_gsi)
+                       ib_destroy_qp(sqp->roce_v2_gsi);
+       }
+
+       return _mlx4_ib_destroy_qp(qp);
+}
+
 static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
 {
        switch (type) {
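
mlx4_ib_create_qp() is now a thin wrapper: on an Ethernet port whose device advertises MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 it recursively creates a second, RoCEv2-flavored GSI QP and hangs it off the first, and the destroy wrapper above tears the shadow down before the owner. A hedged sketch of that owner/shadow pairing (names are illustrative, not the driver's):

#include <stdio.h>
#include <stdlib.h>

struct qp {
        int qpn;
        struct qp *shadow;      /* optional RoCEv2 companion, may be NULL */
};

static struct qp *qp_create(int qpn)
{
        struct qp *qp = calloc(1, sizeof(*qp));

        if (qp)
                qp->qpn = qpn;
        return qp;
}

/* Create the primary QP, then best-effort creation of the shadow; a
 * failure there is logged, not propagated, matching the driver. */
static struct qp *qp_create_pair(int qpn, int shadow_qpn)
{
        struct qp *qp = qp_create(qpn);

        if (!qp)
                return NULL;
        qp->shadow = qp_create(shadow_qpn);
        if (!qp->shadow)
                fprintf(stderr, "shadow QP creation failed, continuing\n");
        return qp;
}

/* Destroy order mirrors mlx4_ib_destroy_qp(): shadow first, then owner. */
static void qp_destroy_pair(struct qp *qp)
{
        free(qp->shadow);
        free(qp);
}

int main(void)
{
        struct qp *gsi = qp_create_pair(1, 0x200);

        if (gsi)
                qp_destroy_pair(gsi);
        return 0;
}
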
@@ -1507,6 +1579,24 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
        return 0;
 }
 
+enum {
+       MLX4_QPC_ROCE_MODE_1 = 0,
+       MLX4_QPC_ROCE_MODE_2 = 2,
+       MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff
+};
+
+static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
+{
+       switch (gid_type) {
+       case IB_GID_TYPE_ROCE:
+               return MLX4_QPC_ROCE_MODE_1;
+       case IB_GID_TYPE_ROCE_UDP_ENCAP:
+               return MLX4_QPC_ROCE_MODE_2;
+       default:
+               return MLX4_QPC_ROCE_MODE_UNDEFINED;
+       }
+}
+
 static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                               const struct ib_qp_attr *attr, int attr_mask,
                               enum ib_qp_state cur_state, enum ib_qp_state new_state)
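
gid_type_to_qpc() above collapses the core GID type into the two-bit RoCE mode the QP context expects; the modify-path hunks below shift it into bits 7:6 of rlkey_roce_mode, the same byte whose bit 4 carries the reserved-lkey enable. A runnable sketch of the packing:

#include <stdint.h>
#include <stdio.h>

enum { ROCE_MODE_V1 = 0, ROCE_MODE_V2 = 2, ROCE_MODE_UNDEF = 0xff };

/* Pack the two-bit RoCE mode into bits 7:6 of the rlkey/roce_mode
 * byte, leaving the reserved-lkey enable (bit 4) untouched. */
static uint8_t pack_rlkey_roce_mode(uint8_t byte, uint8_t roce_mode)
{
        return byte | (uint8_t)(roce_mode << 6);
}

int main(void)
{
        uint8_t byte = 1 << 4;                  /* reserved lkey enabled */

        byte = pack_rlkey_roce_mode(byte, ROCE_MODE_V2);
        printf("rlkey_roce_mode = 0x%02x\n", byte);     /* 0x90 */
        return 0;
}
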
@@ -1633,6 +1723,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                        mlx4_ib_steer_qp_reg(dev, qp, 1);
                        steer_qp = 1;
                }
+
+               if (ibqp->qp_type == IB_QPT_GSI) {
+                       enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
+                               IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE;
+                       u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
+
+                       context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+               }
        }
 
        if (attr_mask & IB_QP_PKEY_INDEX) {
@@ -1650,9 +1748,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                u16 vlan = 0xffff;
                u8 smac[ETH_ALEN];
                int status = 0;
+               int is_eth = rdma_cap_eth_ah(&dev->ib_dev, port_num) &&
+                       attr->ah_attr.ah_flags & IB_AH_GRH;
 
-               if (rdma_cap_eth_ah(&dev->ib_dev, port_num) &&
-                   attr->ah_attr.ah_flags & IB_AH_GRH) {
+               if (is_eth) {
                        int index = attr->ah_attr.grh.sgid_index;
 
                        status = ib_get_cached_gid(ibqp->device, port_num,
@@ -1674,6 +1773,17 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 
                optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
                           MLX4_QP_OPTPAR_SCHED_QUEUE);
+
+               if (is_eth &&
+                   (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) {
+                       u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type);
+
+                       if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) {
+                               err = -EINVAL;
+                               goto out;
+                       }
+                       context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+               }
        }
 
        if (attr_mask & IB_QP_TIMEOUT) {
@@ -1845,7 +1956,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                sqd_event = 0;
 
        if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
-               context->rlkey |= (1 << 4);
+               context->rlkey_roce_mode |= (1 << 4);
 
        /*
         * Before passing a kernel QP to the HW, make sure that the
@@ -2022,8 +2133,8 @@ out:
        return err;
 }
 
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                     int attr_mask, struct ib_udata *udata)
+static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                             int attr_mask, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx4_ib_qp *qp = to_mqp(ibqp);
@@ -2126,6 +2237,27 @@ out:
        return err;
 }
 
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                     int attr_mask, struct ib_udata *udata)
+{
+       struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+       int ret;
+
+       ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
+
+       if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+               struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+               int err = 0;
+
+               if (sqp->roce_v2_gsi)
+                       err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask);
+               if (err)
+                       pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n",
+                              err);
+       }
+       return ret;
+}
+
 static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
 {
        int i;
@@ -2168,7 +2300,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
        if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
                send_size += sizeof (struct mlx4_ib_tunnel_header);
 
-       ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
+       ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
 
        if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
                sqp->ud_header.lrh.service_level =
@@ -2252,16 +2384,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
        return 0;
 }
 
-static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac)
-{
-       int i;
-
-       for (i = ETH_ALEN; i; i--) {
-               dst_mac[i - 1] = src_mac & 0xff;
-               src_mac >>= 8;
-       }
-}
-
+#define MLX4_ROCEV2_QP1_SPORT 0xC000
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
                            void *wqe, unsigned *mlx_seg_len)
 {
@@ -2281,6 +2404,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
        bool is_eth;
        bool is_vlan = false;
        bool is_grh;
+       bool is_udp = false;
+       int ip_version = 0;
 
        send_size = 0;
        for (i = 0; i < wr->wr.num_sge; ++i)
@@ -2289,6 +2414,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
        is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
        is_grh = mlx4_ib_ah_grh_present(ah);
        if (is_eth) {
+               struct ib_gid_attr gid_attr;
+
                if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
                        /* When multi-function is enabled, the ib_core gid
                         * indexes don't necessarily match the hw ones, so
@@ -2302,19 +2429,35 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
                        err = ib_get_cached_gid(ib_dev,
                                                be32_to_cpu(ah->av.ib.port_pd) >> 24,
                                                ah->av.ib.gid_index, &sgid,
-                                               NULL);
-                       if (!err && !memcmp(&sgid, &zgid, sizeof(sgid)))
-                               err = -ENOENT;
-                       if (err)
+                                               &gid_attr);
+                       if (!err) {
+                               if (gid_attr.ndev)
+                                       dev_put(gid_attr.ndev);
+                               if (!memcmp(&sgid, &zgid, sizeof(sgid)))
+                                       err = -ENOENT;
+                       }
+                       if (!err) {
+                               is_udp = gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
+                               if (is_udp) {
+                                       if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
+                                               ip_version = 4;
+                                       else
+                                               ip_version = 6;
+                                       is_grh = false;
+                               }
+                       } else {
                                return err;
+                       }
                }
-
                if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
                        vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
                        is_vlan = 1;
                }
        }
-       ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+       err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
+                               ip_version, is_udp, 0, &sqp->ud_header);
+       if (err)
+               return err;
 
        if (!is_eth) {
                sqp->ud_header.lrh.service_level =
@@ -2323,7 +2466,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
                sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
        }
 
-       if (is_grh) {
+       if (is_grh || (ip_version == 6)) {
                sqp->ud_header.grh.traffic_class =
                        (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
                sqp->ud_header.grh.flow_label    =
@@ -2352,6 +2495,25 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
                       ah->av.ib.dgid, 16);
        }
 
+       if (ip_version == 4) {
+               sqp->ud_header.ip4.tos =
+                       (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+               sqp->ud_header.ip4.id = 0;
+               sqp->ud_header.ip4.frag_off = htons(IP_DF);
+               sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit;
+
+               memcpy(&sqp->ud_header.ip4.saddr,
+                      sgid.raw + 12, 4);
+               memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4);
+               sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header);
+       }
+
+       if (is_udp) {
+               sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT);
+               sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT);
+               sqp->ud_header.udp.csum = 0;
+       }
+
        mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
        if (!is_eth) {
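
A RoCEv2 GID carries the IP address in IPv6 form; an IPv4 peer shows up as a v4-mapped address (::ffff:a.b.c.d), which is why the IPv4 source and destination above are copied from byte offset 12 of the GID, and the UDP destination port is fixed at the RoCEv2 value 4791. A userspace sketch of the classification:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ROCE_V2_UDP_DPORT 4791

/* A GID is v4-mapped iff it looks like ::ffff:a.b.c.d. */
static int gid_is_v4_mapped(const uint8_t gid[16])
{
        static const uint8_t prefix[12] = {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff
        };

        return memcmp(gid, prefix, sizeof(prefix)) == 0;
}

int main(void)
{
        const uint8_t gid[16] = {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff,
                192, 168, 1, 7
        };

        if (gid_is_v4_mapped(gid))
                /* Bytes 12..15 are the IPv4 address, as in the memcpy above. */
                printf("IPv4 %d.%d.%d.%d, UDP dport %d\n",
                       gid[12], gid[13], gid[14], gid[15],
                       ROCE_V2_UDP_DPORT);
        return 0;
}
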
@@ -2380,34 +2542,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
 
        if (is_eth) {
                struct in6_addr in6;
-
+               u16 ether_type;
                u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
 
+               ether_type = (!is_udp) ? MLX4_IB_IBOE_ETHERTYPE :
+                       (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6);
+
                mlx->sched_prio = cpu_to_be16(pcp);
 
+               ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac);
                memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
-               /* FIXME: cache smac value? */
                memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
                memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
                memcpy(&in6, sgid.raw, sizeof(in6));
 
-               if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
-                       u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]);
-                       u8 smac[ETH_ALEN];
-
-                       mlx4_u64_to_smac(smac, mac);
-                       memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN);
-               } else {
-                       /* use the src mac of the tunnel */
-                       memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN);
-               }
 
                if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
                        mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
                if (!is_vlan) {
-                       sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+                       sqp->ud_header.eth.type = cpu_to_be16(ether_type);
                } else {
-                       sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+                       sqp->ud_header.vlan.type = cpu_to_be16(ether_type);
                        sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
                }
        } else {
@@ -2528,25 +2683,6 @@ static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg,
        fseg->reserved[1]       = 0;
 }
 
-static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg,
-               struct ib_bind_mw_wr *wr)
-{
-       bseg->flags1 =
-               convert_access(wr->bind_info.mw_access_flags) &
-               cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ  |
-                           MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE |
-                           MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC);
-       bseg->flags2 = 0;
-       if (wr->mw->type == IB_MW_TYPE_2)
-               bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2);
-       if (wr->bind_info.mw_access_flags & IB_ZERO_BASED)
-               bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED);
-       bseg->new_rkey = cpu_to_be32(wr->rkey);
-       bseg->lkey = cpu_to_be32(wr->bind_info.mr->lkey);
-       bseg->addr = cpu_to_be64(wr->bind_info.addr);
-       bseg->length = cpu_to_be64(wr->bind_info.length);
-}
-
 static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
 {
        memset(iseg, 0, sizeof(*iseg));
@@ -2766,6 +2902,29 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        int i;
        struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 
+       if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+               struct mlx4_ib_sqp *sqp = to_msqp(qp);
+
+               if (sqp->roce_v2_gsi) {
+                       struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah);
+                       struct ib_gid_attr gid_attr;
+                       union ib_gid gid;
+
+                       if (!ib_get_cached_gid(ibqp->device,
+                                              be32_to_cpu(ah->av.ib.port_pd) >> 24,
+                                              ah->av.ib.gid_index, &gid,
+                                              &gid_attr)) {
+                               if (gid_attr.ndev)
+                                       dev_put(gid_attr.ndev);
+                               qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
+                                       to_mqp(sqp->roce_v2_gsi) : qp;
+                       } else {
+                               pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n",
+                                      ah->av.ib.gid_index);
+                       }
+               }
+       }
+
        spin_lock_irqsave(&qp->sq.lock, flags);
        if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
                err = -EIO;
@@ -2867,13 +3026,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                                size += sizeof(struct mlx4_wqe_fmr_seg) / 16;
                                break;
 
-                       case IB_WR_BIND_MW:
-                               ctrl->srcrb_flags |=
-                                       cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
-                               set_bind_seg(wqe, bind_mw_wr(wr));
-                               wqe  += sizeof(struct mlx4_wqe_bind_seg);
-                               size += sizeof(struct mlx4_wqe_bind_seg) / 16;
-                               break;
                        default:
                                /* No extra segments required for sends */
                                break;
index c394376ebe06f159aebca07ef899ef5933386713..0597f3eef5d03ce4729c0fc0f5f0ffd076ae1176 100644 (file)
@@ -171,7 +171,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
                if (err)
                        goto err_mtt;
 
-               srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+               srq->wrid = kmalloc_array(srq->msrq.max, sizeof(u64),
+                                       GFP_KERNEL | __GFP_NOWARN);
                if (!srq->wrid) {
                        srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64),
                                              GFP_KERNEL, PAGE_KERNEL);
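
The SRQ hunk above switches to kmalloc_array() with __GFP_NOWARN so an oversized contiguous allocation fails quietly before the existing __vmalloc() fallback takes over. The same try-contiguous-then-fall-back shape in a userspace sketch, with malloc() standing in for both kernel allocators:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for kmalloc_array(): pretend contiguous memory is only
 * available up to a small cap and fail quietly beyond it, the way
 * __GFP_NOWARN suppresses the allocation-failure splat. */
#define CONTIG_CAP ((size_t)64 * 1024)

static void *alloc_contig(size_t n, size_t size)
{
        if (size && n > CONTIG_CAP / size)      /* cap and overflow check */
                return NULL;
        return malloc(n * size);
}

/* Try the cheap contiguous path first; fall back to the general
 * allocator (standing in for __vmalloc()) on failure. */
static void *alloc_wrid(size_t n)
{
        void *p = alloc_contig(n, sizeof(uint64_t));

        if (!p)
                p = malloc(n * sizeof(uint64_t));
        return p;
}

int main(void)
{
        void *wrid = alloc_wrid(1 << 20);

        printf("allocated via %s path\n",
               (1 << 20) * sizeof(uint64_t) > CONTIG_CAP ?
               "fallback" : "contiguous");
        free(wrid);
        return 0;
}
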
index 66080580e24db3e90c887bcea7132e6687bc3d33..745efa4cfc718eec9b749508f54b091ed13332d3 100644 (file)
@@ -32,8 +32,10 @@
 
 #include "mlx5_ib.h"
 
-struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
-                          struct mlx5_ib_ah *ah)
+static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev,
+                                 struct mlx5_ib_ah *ah,
+                                 struct ib_ah_attr *ah_attr,
+                                 enum rdma_link_layer ll)
 {
        if (ah_attr->ah_flags & IB_AH_GRH) {
                memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16);
@@ -44,9 +46,20 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
                ah->av.tclass = ah_attr->grh.traffic_class;
        }
 
-       ah->av.rlid = cpu_to_be16(ah_attr->dlid);
-       ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f;
-       ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf);
+       ah->av.stat_rate_sl = (ah_attr->static_rate << 4);
+
+       if (ll == IB_LINK_LAYER_ETHERNET) {
+               memcpy(ah->av.rmac, ah_attr->dmac, sizeof(ah_attr->dmac));
+               ah->av.udp_sport =
+                       mlx5_get_roce_udp_sport(dev,
+                                               ah_attr->port_num,
+                                               ah_attr->grh.sgid_index);
+               ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1;
+       } else {
+               ah->av.rlid = cpu_to_be16(ah_attr->dlid);
+               ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f;
+               ah->av.stat_rate_sl |= (ah_attr->sl & 0xf);
+       }
 
        return &ah->ibah;
 }
@@ -54,12 +67,19 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
 struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 {
        struct mlx5_ib_ah *ah;
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       enum rdma_link_layer ll;
+
+       ll = pd->device->get_link_layer(pd->device, ah_attr->port_num);
+
+       if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH))
+               return ERR_PTR(-EINVAL);
 
        ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
        if (!ah)
                return ERR_PTR(-ENOMEM);
 
-       return create_ib_ah(ah_attr, ah); /* never fails */
+       return create_ib_ah(dev, ah, ah_attr, ll); /* never fails */
 }
 
 int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
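
On an Ethernet port the SL now occupies three bits of stat_rate_sl shifted up by one, next to the destination MAC and RoCEv2 UDP source port, while InfiniBand keeps the classic four-bit SL beside rlid/fl_mlid. A sketch of the two encodings:

#include <stdint.h>
#include <stdio.h>

/* Mirrors create_ib_ah(): rate in the high nibble, SL packed
 * differently per link layer. */
static uint8_t pack_stat_rate_sl(uint8_t rate, uint8_t sl, int is_eth)
{
        uint8_t v = (uint8_t)(rate << 4);

        if (is_eth)
                v |= (sl & 0x7) << 1;   /* 3-bit SL, bits 3:1 */
        else
                v |= sl & 0xf;          /* 4-bit SL, bits 3:0 */
        return v;
}

int main(void)
{
        printf("eth: 0x%02x  ib: 0x%02x\n",
               pack_stat_rate_sl(3, 5, 1), pack_stat_rate_sl(3, 5, 0));
        return 0;
}
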
index 92ddae101ecc7dc6032705e56a2c75d46fd0bf71..fd1de31e0611cbbd3d8691e595677d8e182f3198 100644 (file)
@@ -154,9 +154,6 @@ static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
                wc->opcode    = IB_WC_MASKED_FETCH_ADD;
                wc->byte_len  = 8;
                break;
-       case MLX5_OPCODE_BIND_MW:
-               wc->opcode    = IB_WC_BIND_MW;
-               break;
        case MLX5_OPCODE_UMR:
                wc->opcode = get_umr_comp(wq, idx);
                break;
@@ -171,6 +168,7 @@ enum {
 static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
                             struct mlx5_ib_qp *qp)
 {
+       enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1);
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
        struct mlx5_ib_srq *srq;
        struct mlx5_ib_wq *wq;
@@ -236,6 +234,22 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
        } else {
                wc->pkey_index = 0;
        }
+
+       if (ll != IB_LINK_LAYER_ETHERNET)
+               return;
+
+       switch (wc->sl & 0x3) {
+       case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH:
+               wc->network_hdr_type = RDMA_NETWORK_IB;
+               break;
+       case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6:
+               wc->network_hdr_type = RDMA_NETWORK_IPV6;
+               break;
+       case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4:
+               wc->network_hdr_type = RDMA_NETWORK_IPV4;
+               break;
+       }
+       wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
 }
 
 static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
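
On RoCE ports the low two bits of the CQE's SL field say which L3 header the packet carried, and handle_responder() above surfaces that as wc->network_hdr_type together with IB_WC_WITH_NETWORK_HDR_TYPE. A standalone sketch of the decode; the numeric values here are illustrative placeholders, not the hardware encoding:

#include <stdio.h>

/* Placeholder values for the two-bit L3 header type and for the
 * core's RDMA_NETWORK_* enum; the mapping is what matters. */
enum l3_hdr { L3_GRH = 0, L3_IPV6 = 1, L3_IPV4 = 2 };
enum net_hdr { NET_IB, NET_IPV6, NET_IPV4, NET_UNKNOWN };

static enum net_hdr decode_l3(unsigned char cqe_sl)
{
        switch (cqe_sl & 0x3) {
        case L3_GRH:
                return NET_IB;
        case L3_IPV6:
                return NET_IPV6;
        case L3_IPV4:
                return NET_IPV4;
        default:
                return NET_UNKNOWN;
        }
}

int main(void)
{
        printf("sl=0x%x -> %d\n", 0x6, decode_l3(0x6));  /* low bits 0b10 -> IPv4 */
        return 0;
}
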
@@ -760,12 +774,12 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
        int eqn;
        int err;
 
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
        if (entries < 0)
                return ERR_PTR(-EINVAL);
 
+       if (check_cq_create_flags(attr->flags))
+               return ERR_PTR(-EOPNOTSUPP);
+
        entries = roundup_pow_of_two(entries + 1);
        if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
                return ERR_PTR(-EINVAL);
@@ -779,6 +793,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
        spin_lock_init(&cq->lock);
        cq->resize_buf = NULL;
        cq->resize_umem = NULL;
+       cq->create_flags = attr->flags;
 
        if (context) {
                err = create_cq_user(dev, udata, context, cq, entries,
@@ -796,6 +811,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 
        cq->cqe_size = cqe_size;
        cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5;
+
+       if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN)
+               cqb->ctx.cqe_sz_flags |= (1 << 1);
+
        cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index);
        err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn);
        if (err)
index b0ec175cc6ba3b4c63ec8c5e22082a3603b5b473..ec737e2287fe150ff8d5d83e6ca7042a2cca8b62 100644 (file)
@@ -40,6 +40,8 @@
 #include <linux/io-mapping.h>
 #include <linux/sched.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 #include <linux/mlx5/vport.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
@@ -66,12 +68,14 @@ static char mlx5_version[] =
        DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
        DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
 
+enum {
+       MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
+};
+
 static enum rdma_link_layer
-mlx5_ib_port_link_layer(struct ib_device *device)
+mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
 {
-       struct mlx5_ib_dev *dev = to_mdev(device);
-
-       switch (MLX5_CAP_GEN(dev->mdev, port_type)) {
+       switch (port_type_cap) {
        case MLX5_CAP_PORT_TYPE_IB:
                return IB_LINK_LAYER_INFINIBAND;
        case MLX5_CAP_PORT_TYPE_ETH:
@@ -81,6 +85,202 @@ mlx5_ib_port_link_layer(struct ib_device *device)
        }
 }
 
+static enum rdma_link_layer
+mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
+{
+       struct mlx5_ib_dev *dev = to_mdev(device);
+       int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
+
+       return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+}
+
+static int mlx5_netdev_event(struct notifier_block *this,
+                            unsigned long event, void *ptr)
+{
+       struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+       struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
+                                                roce.nb);
+
+       if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER))
+               return NOTIFY_DONE;
+
+       write_lock(&ibdev->roce.netdev_lock);
+       if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
+               ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev;
+       write_unlock(&ibdev->roce.netdev_lock);
+
+       return NOTIFY_DONE;
+}
+
+static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
+                                            u8 port_num)
+{
+       struct mlx5_ib_dev *ibdev = to_mdev(device);
+       struct net_device *ndev;
+
+       /* Ensure ndev does not disappear before we invoke dev_hold()
+        */
+       read_lock(&ibdev->roce.netdev_lock);
+       ndev = ibdev->roce.netdev;
+       if (ndev)
+               dev_hold(ndev);
+       read_unlock(&ibdev->roce.netdev_lock);
+
+       return ndev;
+}
+
+static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
+                               struct ib_port_attr *props)
+{
+       struct mlx5_ib_dev *dev = to_mdev(device);
+       struct net_device *ndev;
+       enum ib_mtu ndev_ib_mtu;
+       u16 qkey_viol_cntr;
+
+       memset(props, 0, sizeof(*props));
+
+       props->port_cap_flags  |= IB_PORT_CM_SUP;
+       props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
+
+       props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
+                                               roce_address_table_size);
+       props->max_mtu          = IB_MTU_4096;
+       props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
+       props->pkey_tbl_len     = 1;
+       props->state            = IB_PORT_DOWN;
+       props->phys_state       = 3;
+
+       mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
+       props->qkey_viol_cntr = qkey_viol_cntr;
+
+       ndev = mlx5_ib_get_netdev(device, port_num);
+       if (!ndev)
+               return 0;
+
+       if (netif_running(ndev) && netif_carrier_ok(ndev)) {
+               props->state      = IB_PORT_ACTIVE;
+               props->phys_state = 5;
+       }
+
+       ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
+
+       dev_put(ndev);
+
+       props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
+
+       props->active_width     = IB_WIDTH_4X;  /* TODO */
+       props->active_speed     = IB_SPEED_QDR; /* TODO */
+
+       return 0;
+}
+
+static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
+                                    const struct ib_gid_attr *attr,
+                                    void *mlx5_addr)
+{
+#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
+       char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
+                                              source_l3_address);
+       void *mlx5_addr_mac     = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
+                                              source_mac_47_32);
+
+       if (!gid)
+               return;
+
+       ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);
+
+       if (is_vlan_dev(attr->ndev)) {
+               MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
+               MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
+       }
+
+       switch (attr->gid_type) {
+       case IB_GID_TYPE_IB:
+               MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
+               break;
+       case IB_GID_TYPE_ROCE_UDP_ENCAP:
+               MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
+               break;
+
+       default:
+               WARN_ON(true);
+       }
+
+       if (attr->gid_type != IB_GID_TYPE_IB) {
+               if (ipv6_addr_v4mapped((void *)gid))
+                       MLX5_SET_RA(mlx5_addr, roce_l3_type,
+                                   MLX5_ROCE_L3_TYPE_IPV4);
+               else
+                       MLX5_SET_RA(mlx5_addr, roce_l3_type,
+                                   MLX5_ROCE_L3_TYPE_IPV6);
+       }
+
+       if ((attr->gid_type == IB_GID_TYPE_IB) ||
+           !ipv6_addr_v4mapped((void *)gid))
+               memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
+       else
+               memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
+}
+
+static int set_roce_addr(struct ib_device *device, u8 port_num,
+                        unsigned int index,
+                        const union ib_gid *gid,
+                        const struct ib_gid_attr *attr)
+{
+       struct mlx5_ib_dev *dev = to_mdev(device);
+       u32  in[MLX5_ST_SZ_DW(set_roce_address_in)];
+       u32 out[MLX5_ST_SZ_DW(set_roce_address_out)];
+       void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
+       enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
+
+       if (ll != IB_LINK_LAYER_ETHERNET)
+               return -EINVAL;
+
+       memset(in, 0, sizeof(in));
+
+       ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
+
+       MLX5_SET(set_roce_address_in, in, roce_address_index, index);
+       MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
+
+       memset(out, 0, sizeof(out));
+       return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
+                          unsigned int index, const union ib_gid *gid,
+                          const struct ib_gid_attr *attr,
+                          __always_unused void **context)
+{
+       return set_roce_addr(device, port_num, index, gid, attr);
+}
+
+static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
+                          unsigned int index, __always_unused void **context)
+{
+       return set_roce_addr(device, port_num, index, NULL, NULL);
+}
+
+__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
+                              int index)
+{
+       struct ib_gid_attr attr;
+       union ib_gid gid;
+
+       if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
+               return 0;
+
+       if (!attr.ndev)
+               return 0;
+
+       dev_put(attr.ndev);
+
+       if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
+               return 0;
+
+       return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
+}
+
 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
 {
        return !dev->mdev->issi;
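
The block above caches the companion net_device behind a reader/writer lock: the netdev notifier publishes or retracts the pointer under the write side, and mlx5_ib_get_netdev() holds the read side just long enough to take a reference. A userspace model of the same publish/acquire pattern, using pthreads in place of the kernel's rwlock and dev_hold():

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct netdev {
        atomic_int refcnt;      /* dev_hold()/dev_put() are atomic too */
};

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static struct netdev *cached;   /* protected by lock */

/* Notifier side: publish or retract the pointer under the write lock. */
static void on_netdev_event(struct netdev *dev, int registered)
{
        pthread_rwlock_wrlock(&lock);
        cached = registered ? dev : NULL;
        pthread_rwlock_unlock(&lock);
}

/* Consumer side: the read lock pins the pointer just long enough to
 * take a reference, mirroring dev_hold() in mlx5_ib_get_netdev(). */
static struct netdev *get_netdev(void)
{
        struct netdev *dev;

        pthread_rwlock_rdlock(&lock);
        dev = cached;
        if (dev)
                atomic_fetch_add(&dev->refcnt, 1);
        pthread_rwlock_unlock(&lock);
        return dev;
}

int main(void)
{
        struct netdev eth0 = { .refcnt = 1 };

        on_netdev_event(&eth0, 1);
        struct netdev *dev = get_netdev();

        printf("refcnt now %d\n", dev ? atomic_load(&dev->refcnt) : 0);
        on_netdev_event(&eth0, 0);
        return 0;
}
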
@@ -97,13 +297,35 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev)
        if (mlx5_use_mad_ifc(to_mdev(ibdev)))
                return MLX5_VPORT_ACCESS_METHOD_MAD;
 
-       if (mlx5_ib_port_link_layer(ibdev) ==
+       if (mlx5_ib_port_link_layer(ibdev, 1) ==
            IB_LINK_LAYER_ETHERNET)
                return MLX5_VPORT_ACCESS_METHOD_NIC;
 
        return MLX5_VPORT_ACCESS_METHOD_HCA;
 }
 
+static void get_atomic_caps(struct mlx5_ib_dev *dev,
+                           struct ib_device_attr *props)
+{
+       u8 tmp;
+       u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
+       u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
+       u8 atomic_req_8B_endianness_mode =
+               MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
+
+       /* Check whether the HW supports 8-byte standard atomic operations
+        * and can return responses in host endianness.
+        */
+       tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
+       if (((atomic_operations & tmp) == tmp) &&
+           (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
+           (atomic_req_8B_endianness_mode)) {
+               props->atomic_cap = IB_ATOMIC_HCA;
+       } else {
+               props->atomic_cap = IB_ATOMIC_NONE;
+       }
+}
+
 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
                                        __be64 *sys_image_guid)
 {
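
get_atomic_caps() above only advertises IB_ATOMIC_HCA when all three conditions line up: both standard atomic opcodes, 8-byte operand support, and host-endianness responses. The all-bits-present test is the usual (caps & need) == need idiom; a tiny sketch with illustrative flag values:

#include <stdbool.h>
#include <stdio.h>

#define OP_CMP_SWAP     (1 << 0)        /* illustrative bit values */
#define OP_FETCH_ADD    (1 << 1)
#define SIZE_8B         (1 << 3)

/* All required bits must be present, not just some of them. */
static bool atomics_usable(unsigned ops, unsigned sizes, bool host_endian)
{
        unsigned need = OP_CMP_SWAP | OP_FETCH_ADD;

        return (ops & need) == need && (sizes & SIZE_8B) && host_endian;
}

int main(void)
{
        printf("%d\n", atomics_usable(OP_CMP_SWAP | OP_FETCH_ADD,
                                      SIZE_8B, true));          /* 1 */
        printf("%d\n", atomics_usable(OP_CMP_SWAP, SIZE_8B, true));     /* 0 */
        return 0;
}
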
@@ -119,13 +341,20 @@ static int mlx5_query_system_image_guid(struct ib_device *ibdev,
 
        case MLX5_VPORT_ACCESS_METHOD_HCA:
                err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
-               if (!err)
-                       *sys_image_guid = cpu_to_be64(tmp);
-               return err;
+               break;
+
+       case MLX5_VPORT_ACCESS_METHOD_NIC:
+               err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
+               break;
 
        default:
                return -EINVAL;
        }
+
+       if (!err)
+               *sys_image_guid = cpu_to_be64(tmp);
+
+       return err;
 }
 
 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
@@ -179,13 +409,20 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
 
        case MLX5_VPORT_ACCESS_METHOD_HCA:
                err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
-               if (!err)
-                       *node_guid = cpu_to_be64(tmp);
-               return err;
+               break;
+
+       case MLX5_VPORT_ACCESS_METHOD_NIC:
+               err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
+               break;
 
        default:
                return -EINVAL;
        }
+
+       if (!err)
+               *node_guid = cpu_to_be64(tmp);
+
+       return err;
 }
 
 struct mlx5_reg_node_desc {
@@ -263,6 +500,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        if (MLX5_CAP_GEN(mdev, block_lb_mc))
                props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 
+       if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
+           (MLX5_CAP_ETH(dev->mdev, csum_cap)))
+               props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
+
        props->vendor_part_id      = mdev->pdev->device;
        props->hw_ver              = mdev->pdev->revision;
 
@@ -278,7 +519,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        props->max_sge = min(max_rq_sg, max_sq_sg);
        props->max_sge_rd = props->max_sge;
        props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
-       props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1;
+       props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
        props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
        props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
        props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
@@ -289,13 +530,15 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
        props->max_srq_sge         = max_rq_sg - 1;
        props->max_fast_reg_page_list_len = (unsigned int)-1;
-       props->atomic_cap          = IB_ATOMIC_NONE;
+       get_atomic_caps(dev, props);
        props->masked_atomic_cap   = IB_ATOMIC_NONE;
        props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
        props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
        props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
                                           props->max_mcast_grp;
        props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
+       props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
+       props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        if (MLX5_CAP_GEN(mdev, pg))
@@ -303,6 +546,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        props->odp_caps = dev->odp_caps;
 #endif
 
+       if (MLX5_CAP_GEN(mdev, cd))
+               props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
+
        return 0;
 }
 
@@ -483,6 +729,9 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
        case MLX5_VPORT_ACCESS_METHOD_HCA:
                return mlx5_query_hca_port(ibdev, port, props);
 
+       case MLX5_VPORT_ACCESS_METHOD_NIC:
+               return mlx5_query_port_roce(ibdev, port, props);
+
        default:
                return -EINVAL;
        }
@@ -583,8 +832,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                                                  struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
-       struct mlx5_ib_alloc_ucontext_req_v2 req;
-       struct mlx5_ib_alloc_ucontext_resp resp;
+       struct mlx5_ib_alloc_ucontext_req_v2 req = {};
+       struct mlx5_ib_alloc_ucontext_resp resp = {};
        struct mlx5_ib_ucontext *context;
        struct mlx5_uuar_info *uuari;
        struct mlx5_uar *uars;
@@ -599,20 +848,22 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
        if (!dev->ib_active)
                return ERR_PTR(-EAGAIN);
 
-       memset(&req, 0, sizeof(req));
+       if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
+               return ERR_PTR(-EINVAL);
+
        reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
        if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
                ver = 0;
-       else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
+       else if (reqlen >= sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
                ver = 2;
        else
                return ERR_PTR(-EINVAL);
 
-       err = ib_copy_from_udata(&req, udata, reqlen);
+       err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
        if (err)
                return ERR_PTR(err);
 
-       if (req.flags || req.reserved)
+       if (req.flags)
                return ERR_PTR(-EINVAL);
 
        if (req.total_num_uuars > MLX5_MAX_UUARS)
@@ -621,6 +872,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
        if (req.total_num_uuars == 0)
                return ERR_PTR(-EINVAL);
 
+       if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
+               return ERR_PTR(-EOPNOTSUPP);
+
+       if (reqlen > sizeof(req) &&
+           !ib_is_udata_cleared(udata, sizeof(req),
+                                reqlen - sizeof(req)))
+               return ERR_PTR(-EOPNOTSUPP);
+
        req.total_num_uuars = ALIGN(req.total_num_uuars,
                                    MLX5_NON_FP_BF_REGS_PER_PAGE);
        if (req.num_low_latency_uuars > req.total_num_uuars - 1)
@@ -636,6 +895,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
        resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
        resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
        resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
+       resp.cqe_version = min_t(__u8,
+                                (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
+                                req.max_cqe_version);
+       resp.response_length = min(offsetof(typeof(resp), response_length) +
+                                  sizeof(resp.response_length), udata->outlen);
 
        context = kzalloc(sizeof(*context), GFP_KERNEL);
        if (!context)
@@ -681,22 +945,49 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
        context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
 #endif
 
+       if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
+               err = mlx5_core_alloc_transport_domain(dev->mdev,
+                                                      &context->tdn);
+               if (err)
+                       goto out_uars;
+       }
+
        INIT_LIST_HEAD(&context->db_page_list);
        mutex_init(&context->db_page_mutex);
 
        resp.tot_uuars = req.total_num_uuars;
        resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
-       err = ib_copy_to_udata(udata, &resp,
-                              sizeof(resp) - sizeof(resp.reserved));
+
+       if (field_avail(typeof(resp), cqe_version, udata->outlen))
+               resp.response_length += sizeof(resp.cqe_version);
+
+       if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
+               resp.comp_mask |=
+                       MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
+               resp.hca_core_clock_offset =
+                       offsetof(struct mlx5_init_seg, internal_timer_h) %
+                       PAGE_SIZE;
+               resp.response_length += sizeof(resp.hca_core_clock_offset) +
+                                       sizeof(resp.reserved2) +
+                                       sizeof(resp.reserved3);
+       }
+
+       err = ib_copy_to_udata(udata, &resp, resp.response_length);
        if (err)
-               goto out_uars;
+               goto out_td;
 
        uuari->ver = ver;
        uuari->num_low_latency_uuars = req.num_low_latency_uuars;
        uuari->uars = uars;
        uuari->num_uars = num_uars;
+       context->cqe_version = resp.cqe_version;
+
        return &context->ibucontext;
 
+out_td:
+       if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
+               mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+
 out_uars:
        for (i--; i >= 0; i--)
                mlx5_cmd_free_uar(dev->mdev, uars[i].index);
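
The ucontext response above is now extensible: only response_length bytes are copied back, and each trailing field is included only when the caller's output buffer is large enough to hold it, which is exactly what the field_avail() macro (defined in mlx5_ib.h further down) checks. A userspace model of that forward- and backward-compatible handshake:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define field_avail(type, fld, sz) \
        (offsetof(type, fld) + sizeof(((type *)0)->fld) <= (sz))

struct resp {
        unsigned int tot_uuars;         /* original ABI */
        unsigned int response_length;
        unsigned char cqe_version;      /* added later */
};

/* Fill only the fields the (possibly older) caller has room for,
 * then report via response_length how much was actually written. */
static size_t build_resp(struct resp *r, size_t outlen)
{
        memset(r, 0, sizeof(*r));
        r->tot_uuars = 16;
        r->response_length = offsetof(struct resp, response_length) +
                             sizeof(r->response_length);

        if (field_avail(struct resp, cqe_version, outlen)) {
                r->cqe_version = 1;
                r->response_length += sizeof(r->cqe_version);
        }
        return r->response_length <= outlen ? r->response_length : outlen;
}

int main(void)
{
        struct resp r;
        size_t n = build_resp(&r, sizeof(struct resp));

        printf("copy %zu bytes, cqe_version=%d\n", n, r.cqe_version);
        return 0;
}
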
@@ -721,6 +1012,9 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
        struct mlx5_uuar_info *uuari = &context->uuari;
        int i;
 
+       if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
+               mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+
        for (i = 0; i < uuari->num_uars; i++) {
                if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
                        mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
@@ -790,6 +1084,30 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
        case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
                return -ENOSYS;
 
+       case MLX5_IB_MMAP_CORE_CLOCK:
+               if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+                       return -EINVAL;
+
+               if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+                       return -EPERM;
+
+               /* Don't expose to user-space information it shouldn't have */
+               if (PAGE_SIZE > 4096)
+                       return -EOPNOTSUPP;
+
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+               pfn = (dev->mdev->iseg_base +
+                      offsetof(struct mlx5_init_seg, internal_timer_h)) >>
+                       PAGE_SHIFT;
+               if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+                                      PAGE_SIZE, vma->vm_page_prot))
+                       return -EAGAIN;
+
+               mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
+                           vma->vm_start,
+                           (unsigned long long)pfn << PAGE_SHIFT);
+               break;
+
        default:
                return -EINVAL;
        }
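
MLX5_IB_MMAP_CORE_CLOCK above maps the page holding the device's free-running cycle counter read-only into user space. Since the counter is exposed as two 32-bit halves, a reader has to guard against torn reads; one common technique (a hedged sketch, not the libmlx5 implementation) is the high-low-high retry loop:

#include <stdint.h>
#include <stdio.h>

/* Two device registers exposing one 64-bit counter, high word first.
 * volatile models an MMIO mapping; real code would read the mmap()ed
 * page returned for the core-clock offset. */
static volatile uint32_t timer_h, timer_l;

static uint64_t read_core_clock(void)
{
        uint32_t h1, h2, l;

        do {
                h1 = timer_h;
                l  = timer_l;
                h2 = timer_h;
        } while (h1 != h2);     /* retry if the high word rolled over */

        return ((uint64_t)h1 << 32) | l;
}

int main(void)
{
        timer_h = 1;
        timer_l = 0xdeadbeef;
        printf("clock = 0x%llx\n",
               (unsigned long long)read_core_clock());
        return 0;
}
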
@@ -1758,6 +2076,32 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr)
        mlx5_ib_dealloc_pd(devr->p0);
 }
 
+static u32 get_core_cap_flags(struct ib_device *ibdev)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
+       u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
+       u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
+       u32 ret = 0;
+
+       if (ll == IB_LINK_LAYER_INFINIBAND)
+               return RDMA_CORE_PORT_IBA_IB;
+
+       if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
+               return 0;
+
+       if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
+               return 0;
+
+       if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
+               ret |= RDMA_CORE_PORT_IBA_ROCE;
+
+       if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
+               ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+
+       return ret;
+}
+
 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
                               struct ib_port_immutable *immutable)
 {
@@ -1770,20 +2114,50 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
 
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+       immutable->core_cap_flags = get_core_cap_flags(ibdev);
        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
 
        return 0;
 }
 
+static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
+{
+       int err;
+
+       dev->roce.nb.notifier_call = mlx5_netdev_event;
+       err = register_netdevice_notifier(&dev->roce.nb);
+       if (err)
+               return err;
+
+       err = mlx5_nic_vport_enable_roce(dev->mdev);
+       if (err)
+               goto err_unregister_netdevice_notifier;
+
+       return 0;
+
+err_unregister_netdevice_notifier:
+       unregister_netdevice_notifier(&dev->roce.nb);
+       return err;
+}
+
+static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
+{
+       mlx5_nic_vport_disable_roce(dev->mdev);
+       unregister_netdevice_notifier(&dev->roce.nb);
+}
+
 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 {
        struct mlx5_ib_dev *dev;
+       enum rdma_link_layer ll;
+       int port_type_cap;
        int err;
        int i;
 
-       /* don't create IB instance over Eth ports, no RoCE yet! */
-       if (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH)
+       port_type_cap = MLX5_CAP_GEN(mdev, port_type);
+       ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+
+       if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce))
                return NULL;
 
        printk_once(KERN_INFO "%s", mlx5_version);
@@ -1794,6 +2168,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 
        dev->mdev = mdev;
 
+       rwlock_init(&dev->roce.netdev_lock);
        err = get_port_caps(dev);
        if (err)
                goto err_dealloc;
@@ -1843,7 +2218,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 
        dev->ib_dev.query_device        = mlx5_ib_query_device;
        dev->ib_dev.query_port          = mlx5_ib_query_port;
+       dev->ib_dev.get_link_layer      = mlx5_ib_port_link_layer;
+       if (ll == IB_LINK_LAYER_ETHERNET)
+               dev->ib_dev.get_netdev  = mlx5_ib_get_netdev;
        dev->ib_dev.query_gid           = mlx5_ib_query_gid;
+       dev->ib_dev.add_gid             = mlx5_ib_add_gid;
+       dev->ib_dev.del_gid             = mlx5_ib_del_gid;
        dev->ib_dev.query_pkey          = mlx5_ib_query_pkey;
        dev->ib_dev.modify_device       = mlx5_ib_modify_device;
        dev->ib_dev.modify_port         = mlx5_ib_modify_port;
@@ -1893,7 +2273,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                        (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
        }
 
-       if (mlx5_ib_port_link_layer(&dev->ib_dev) ==
+       if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
            IB_LINK_LAYER_ETHERNET) {
                dev->ib_dev.create_flow = mlx5_ib_create_flow;
                dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
@@ -1908,9 +2288,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        mutex_init(&dev->flow_db.lock);
        mutex_init(&dev->cap_mask_mutex);
 
+       if (ll == IB_LINK_LAYER_ETHERNET) {
+               err = mlx5_enable_roce(dev);
+               if (err)
+                       goto err_dealloc;
+       }
+
        err = create_dev_resources(&dev->devr);
        if (err)
-               goto err_dealloc;
+               goto err_disable_roce;
 
        err = mlx5_ib_odp_init_one(dev);
        if (err)
@@ -1947,6 +2333,10 @@ err_odp:
 err_rsrc:
        destroy_dev_resources(&dev->devr);
 
+err_disable_roce:
+       if (ll == IB_LINK_LAYER_ETHERNET)
+               mlx5_disable_roce(dev);
+
 err_dealloc:
        ib_dealloc_device((struct ib_device *)dev);
 
@@ -1956,11 +2346,14 @@ err_dealloc:
 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 {
        struct mlx5_ib_dev *dev = context;
+       enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
 
        ib_unregister_device(&dev->ib_dev);
        destroy_umrc_res(dev);
        mlx5_ib_odp_remove_one(dev);
        destroy_dev_resources(&dev->devr);
+       if (ll == IB_LINK_LAYER_ETHERNET)
+               mlx5_disable_roce(dev);
        ib_dealloc_device(&dev->ib_dev);
 }
 
index 1474cccd1e0f3c4b5e9bc5b6575dc32fc8535349..d2b9737baa3675b1dc9ced794767f1790ada35ee 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/mlx5/qp.h>
 #include <linux/mlx5/srq.h>
 #include <linux/types.h>
+#include <linux/mlx5/transobj.h>
 
 #define mlx5_ib_dbg(dev, format, arg...)                               \
 pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,   \
@@ -55,6 +56,11 @@ pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,   \
 pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,    \
        __LINE__, current->pid, ##arg)
 
+#define field_avail(type, fld, sz) (offsetof(type, fld) +              \
+                                   sizeof(((type *)0)->fld) <= (sz))
+#define MLX5_IB_DEFAULT_UIDX 0xffffff
+#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index)
+
 enum {
        MLX5_IB_MMAP_CMD_SHIFT  = 8,
        MLX5_IB_MMAP_CMD_MASK   = 0xff,
@@ -62,7 +68,9 @@ enum {
 
 enum mlx5_ib_mmap_cmd {
        MLX5_IB_MMAP_REGULAR_PAGE               = 0,
-       MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES       = 1, /* always last */
+       MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES       = 1,
+       /* 5 is chosen in order to be compatible with old versions of libmlx5 */
+       MLX5_IB_MMAP_CORE_CLOCK                 = 5,
 };
 
 enum {
@@ -85,6 +93,15 @@ enum mlx5_ib_mad_ifc_flags {
        MLX5_MAD_IFC_NET_VIEW           = 4,
 };
 
+enum {
+       MLX5_CROSS_CHANNEL_UUAR         = 0,
+};
+
+enum {
+       MLX5_CQE_VERSION_V0,
+       MLX5_CQE_VERSION_V1,
+};
+
 struct mlx5_ib_ucontext {
        struct ib_ucontext      ibucontext;
        struct list_head        db_page_list;
@@ -93,6 +110,9 @@ struct mlx5_ib_ucontext {
         */
        struct mutex            db_page_mutex;
        struct mlx5_uuar_info   uuari;
+       u8                      cqe_version;
+       /* Transport Domain number */
+       u32                     tdn;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -201,47 +221,70 @@ struct mlx5_ib_pfault {
        struct mlx5_pagefault   mpfault;
 };
 
+struct mlx5_ib_ubuffer {
+       struct ib_umem         *umem;
+       int                     buf_size;
+       u64                     buf_addr;
+};
+
+struct mlx5_ib_qp_base {
+       struct mlx5_ib_qp       *container_mibqp;
+       struct mlx5_core_qp     mqp;
+       struct mlx5_ib_ubuffer  ubuffer;
+};
+
+struct mlx5_ib_qp_trans {
+       struct mlx5_ib_qp_base  base;
+       u16                     xrcdn;
+       u8                      alt_port;
+       u8                      atomic_rd_en;
+       u8                      resp_depth;
+};
+
 struct mlx5_ib_rq {
+       struct mlx5_ib_qp_base base;
+       struct mlx5_ib_wq       *rq;
+       struct mlx5_ib_ubuffer  ubuffer;
+       struct mlx5_db          *doorbell;
        u32                     tirn;
+       u8                      state;
+};
+
+struct mlx5_ib_sq {
+       struct mlx5_ib_qp_base base;
+       struct mlx5_ib_wq       *sq;
+       struct mlx5_ib_ubuffer  ubuffer;
+       struct mlx5_db          *doorbell;
+       u32                     tisn;
+       u8                      state;
 };
 
 struct mlx5_ib_raw_packet_qp {
+       struct mlx5_ib_sq sq;
        struct mlx5_ib_rq rq;
 };
 
 struct mlx5_ib_qp {
        struct ib_qp            ibqp;
        union {
-               struct mlx5_core_qp             mqp;
-               struct mlx5_ib_raw_packet_qp    raw_packet_qp;
+               struct mlx5_ib_qp_trans trans_qp;
+               struct mlx5_ib_raw_packet_qp raw_packet_qp;
        };
-
        struct mlx5_buf         buf;
 
        struct mlx5_db          db;
        struct mlx5_ib_wq       rq;
 
-       u32                     doorbell_qpn;
        u8                      sq_signal_bits;
        u8                      fm_cache;
-       int                     sq_max_wqes_per_wr;
-       int                     sq_spare_wqes;
        struct mlx5_ib_wq       sq;
 
-       struct ib_umem         *umem;
-       int                     buf_size;
-
        /* serialize qp state modifications
         */
        struct mutex            mutex;
-       u16                     xrcdn;
        u32                     flags;
        u8                      port;
-       u8                      alt_port;
-       u8                      atomic_rd_en;
-       u8                      resp_depth;
        u8                      state;
-       int                     mlx_type;
        int                     wq_sig;
        int                     scat_cqe;
        int                     max_inline_data;
@@ -284,6 +327,9 @@ struct mlx5_ib_cq_buf {
 enum mlx5_ib_qp_flags {
        MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK     = 1 << 0,
        MLX5_IB_QP_SIGNATURE_HANDLING           = 1 << 1,
+       MLX5_IB_QP_CROSS_CHANNEL                = 1 << 2,
+       MLX5_IB_QP_MANAGED_SEND                 = 1 << 3,
+       MLX5_IB_QP_MANAGED_RECV                 = 1 << 4,
 };
 
 struct mlx5_umr_wr {
@@ -326,6 +372,7 @@ struct mlx5_ib_cq {
        struct mlx5_ib_cq_buf  *resize_buf;
        struct ib_umem         *resize_umem;
        int                     cqe_size;
+       u32                     create_flags;
 };
 
 struct mlx5_ib_srq {
@@ -449,9 +496,19 @@ struct mlx5_ib_resources {
        struct ib_srq   *s1;
 };
 
+struct mlx5_roce {
+       /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
+        * netdev pointer
+        */
+       rwlock_t                netdev_lock;
+       struct net_device       *netdev;
+       struct notifier_block   nb;
+};
+
 struct mlx5_ib_dev {
        struct ib_device                ib_dev;
        struct mlx5_core_dev            *mdev;
+       struct mlx5_roce                roce;
        MLX5_DECLARE_DOORBELL_LOCK(uar_lock);
        int                             num_ports;
        /* serialize update of capability mask
@@ -498,7 +555,7 @@ static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq)
 
 static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp)
 {
-       return container_of(mqp, struct mlx5_ib_qp, mqp);
+       return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp;
 }
 
 static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr)
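
to_mibqp() can no longer assume the mlx5_core_qp sits directly inside mlx5_ib_qp: a raw-packet QP now embeds two of them (one per SQ and RQ base), so each mlx5_ib_qp_base carries an explicit container_mibqp back-pointer that is followed after the container_of() step. A standalone illustration:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct core_qp { int qpn; };

struct qp;                      /* forward declaration */

struct qp_base {
        struct qp *owner;       /* back-pointer, cf. container_mibqp */
        struct core_qp mqp;
};

/* A raw-packet QP embeds two bases, so container_of() alone cannot
 * recover the owning struct qp from a core_qp; the back-pointer can. */
struct qp {
        struct qp_base sq_base;
        struct qp_base rq_base;
        int id;
};

static struct qp *to_qp(struct core_qp *mqp)
{
        return container_of(mqp, struct qp_base, mqp)->owner;
}

int main(void)
{
        struct qp qp = { .id = 7 };

        qp.sq_base.owner = &qp;
        qp.rq_base.owner = &qp;
        printf("owner id via rq: %d\n", to_qp(&qp.rq_base.mqp)->id);
        return 0;
}
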
@@ -550,8 +607,6 @@ void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
 int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
                 u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
                 const void *in_mad, void *response_mad);
-struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
-                          struct mlx5_ib_ah *ah);
 struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
 int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
 int mlx5_ib_destroy_ah(struct ib_ah *ah);
@@ -578,7 +633,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
                      struct ib_recv_wr **bad_wr);
 void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
 int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
-                         void *buffer, u32 length);
+                         void *buffer, u32 length,
+                         struct mlx5_ib_qp_base *base);
 struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
                                const struct ib_cq_init_attr *attr,
                                struct ib_ucontext *context,
@@ -680,6 +736,9 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)  {}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
+__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
+                              int index);
+
 static inline void init_query_mad(struct ib_smp *mad)
 {
        mad->base_version  = 1;
@@ -705,4 +764,28 @@ static inline int is_qp1(enum ib_qp_type qp_type)
 #define MLX5_MAX_UMR_SHIFT 16
 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
 
+static inline u32 check_cq_create_flags(u32 flags)
+{
+       /*
+        * Return a non-zero value for unsupported CQ create flags,
+        * zero otherwise.
+        */
+       return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN |
+                         IB_CQ_FLAGS_TIMESTAMP_COMPLETION));
+}
+
+static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx,
+                                    u32 *user_index)
+{
+       if (cqe_version) {
+               if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) ||
+                   (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK))
+                       return -EINVAL;
+               *user_index = cmd_uidx;
+       } else {
+               *user_index = MLX5_IB_DEFAULT_UIDX;
+       }
+
+       return 0;
+}
 #endif /* MLX5_IB_H */
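
A minimal caller-side sketch of the two helpers above, assuming a hypothetical example_create() wrapper (the helpers and the ucontext cqe_version field are as in the patch):

	static int example_create(struct mlx5_ib_ucontext *ctx,
				  u32 create_flags, u32 cmd_uidx)
	{
		u32 uidx;

		/* reject any CQ create flag the driver does not implement */
		if (check_cq_create_flags(create_flags))
			return -EINVAL;

		/* cqe_version 0 ignores cmd_uidx and keeps the default index;
		 * version 1 validates and propagates the user-supplied value
		 */
		return verify_assign_uidx(ctx->cqe_version, cmd_uidx, &uidx);
	}
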
index aa8391e75385016bb11a2526f8660331193b7c4e..b8d76361a48dcfe29914ff13011b09d2893c851d 100644
@@ -153,14 +153,16 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 
 static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
                                      struct mlx5_ib_pfault *pfault,
-                                     int error) {
+                                     int error)
+{
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
-       int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
+       u32 qpn = qp->trans_qp.base.mqp.qpn;
+       int ret = mlx5_core_page_fault_resume(dev->mdev,
+                                             qpn,
                                              pfault->mpfault.flags,
                                              error);
        if (ret)
-               pr_err("Failed to resolve the page fault on QP 0x%x\n",
-                      qp->mqp.qpn);
+               pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn);
 }
 
 /*
@@ -391,6 +393,7 @@ static int mlx5_ib_mr_initiator_pfault_handler(
 #if defined(DEBUG)
        u32 ctrl_wqe_index, ctrl_qpn;
 #endif
+       u32 qpn = qp->trans_qp.base.mqp.qpn;
 
        ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
        if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
@@ -401,7 +404,7 @@ static int mlx5_ib_mr_initiator_pfault_handler(
 
        if (ds == 0) {
                mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
-                           wqe_index, qp->mqp.qpn);
+                           wqe_index, qpn);
                return -EFAULT;
        }
 
@@ -411,16 +414,16 @@ static int mlx5_ib_mr_initiator_pfault_handler(
                        MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
        if (wqe_index != ctrl_wqe_index) {
                mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
-                           wqe_index, qp->mqp.qpn,
+                           wqe_index, qpn,
                            ctrl_wqe_index);
                return -EFAULT;
        }
 
        ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
                MLX5_WQE_CTRL_QPN_SHIFT;
-       if (qp->mqp.qpn != ctrl_qpn) {
+       if (qpn != ctrl_qpn) {
                mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
-                           wqe_index, qp->mqp.qpn,
+                           wqe_index, qpn,
                            ctrl_qpn);
                return -EFAULT;
        }
@@ -537,6 +540,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
        int resume_with_error = 0;
        u16 wqe_index = pfault->mpfault.wqe.wqe_index;
        int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
+       u32 qpn = qp->trans_qp.base.mqp.qpn;
 
        buffer = (char *)__get_free_page(GFP_KERNEL);
        if (!buffer) {
@@ -546,10 +550,10 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
        }
 
        ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
-                                   PAGE_SIZE);
+                                   PAGE_SIZE, &qp->trans_qp.base);
        if (ret < 0) {
                mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
-                           -ret, wqe_index, qp->mqp.qpn);
+                           -ret, wqe_index, qpn);
                resume_with_error = 1;
                goto resolve_page_fault;
        }
@@ -586,7 +590,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
 resolve_page_fault:
        mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
-                   qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);
+                   qpn, resume_with_error,
+                   pfault->mpfault.flags);
 
        free_page((unsigned long)buffer);
 }
@@ -753,7 +758,7 @@ void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
        qp->disable_page_faults = 1;
        spin_lock_init(&qp->disable_page_faults_lock);
 
-       qp->mqp.pfault_handler  = mlx5_ib_pfault_handler;
+       qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler;
 
        for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
                INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
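
The change running through this file is mechanical: the mlx5_core QP context no longer sits directly on struct mlx5_ib_qp, so every former qp->mqp access now goes through the transport QP base:

	u32 qpn = qp->trans_qp.base.mqp.qpn;	/* was: qp->mqp.qpn */
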
index 307bdbca8938e1d87493d7048b93126bd9261e26..8fb9c27485e19959a09edf3b3bdf3f7c56557732 100644
@@ -32,6 +32,8 @@
 
 #include <linux/module.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_user_verbs.h>
 #include "mlx5_ib.h"
 #include "user.h"
 
@@ -114,14 +116,15 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
  * Return: the number of bytes copied, or an error code.
  */
 int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
-                         void *buffer, u32 length)
+                         void *buffer, u32 length,
+                         struct mlx5_ib_qp_base *base)
 {
        struct ib_device *ibdev = qp->ibqp.device;
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
        size_t offset;
        size_t wq_end;
-       struct ib_umem *umem = qp->umem;
+       struct ib_umem *umem = base->ubuffer.umem;
        u32 first_copy_length;
        int wqe_length;
        int ret;
@@ -172,8 +175,10 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
        struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
        struct ib_event event;
 
-       if (type == MLX5_EVENT_TYPE_PATH_MIG)
-               to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+       if (type == MLX5_EVENT_TYPE_PATH_MIG) {
+               /* This event is only valid for trans_qps */
+               to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port;
+       }
 
        if (ibqp->event_handler) {
                event.device     = ibqp->device;
@@ -366,7 +371,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
 
 static int set_user_buf_size(struct mlx5_ib_dev *dev,
                            struct mlx5_ib_qp *qp,
-                           struct mlx5_ib_create_qp *ucmd)
+                           struct mlx5_ib_create_qp *ucmd,
+                           struct mlx5_ib_qp_base *base,
+                           struct ib_qp_init_attr *attr)
 {
        int desc_sz = 1 << qp->sq.wqe_shift;
 
@@ -391,8 +398,13 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev,
                return -EINVAL;
        }
 
-       qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
-               (qp->sq.wqe_cnt << 6);
+       if (attr->qp_type == IB_QPT_RAW_PACKET) {
+               base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+               qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6;
+       } else {
+               base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+                                        (qp->sq.wqe_cnt << 6);
+       }
 
        return 0;
 }
@@ -578,8 +590,8 @@ static int to_mlx5_st(enum ib_qp_type type)
        case IB_QPT_SMI:                return MLX5_QP_ST_QP0;
        case IB_QPT_GSI:                return MLX5_QP_ST_QP1;
        case IB_QPT_RAW_IPV6:           return MLX5_QP_ST_RAW_IPV6;
-       case IB_QPT_RAW_ETHERTYPE:      return MLX5_QP_ST_RAW_ETHERTYPE;
        case IB_QPT_RAW_PACKET:
+       case IB_QPT_RAW_ETHERTYPE:      return MLX5_QP_ST_RAW_ETHERTYPE;
        case IB_QPT_MAX:
        default:                return -EINVAL;
        }
@@ -590,13 +602,51 @@ static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
        return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
 }
 
+static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
+                           struct ib_pd *pd,
+                           unsigned long addr, size_t size,
+                           struct ib_umem **umem,
+                           int *npages, int *page_shift, int *ncont,
+                           u32 *offset)
+{
+       int err;
+
+       *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
+       if (IS_ERR(*umem)) {
+               mlx5_ib_dbg(dev, "umem_get failed\n");
+               return PTR_ERR(*umem);
+       }
+
+       mlx5_ib_cont_pages(*umem, addr, npages, page_shift, ncont, NULL);
+
+       err = mlx5_ib_get_buf_offset(addr, *page_shift, offset);
+       if (err) {
+               mlx5_ib_warn(dev, "bad offset\n");
+               goto err_umem;
+       }
+
+       mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n",
+                   addr, size, *npages, *page_shift, *ncont, *offset);
+
+       return 0;
+
+err_umem:
+       ib_umem_release(*umem);
+       *umem = NULL;
+
+       return err;
+}
+
 static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                          struct mlx5_ib_qp *qp, struct ib_udata *udata,
+                         struct ib_qp_init_attr *attr,
                          struct mlx5_create_qp_mbox_in **in,
-                         struct mlx5_ib_create_qp_resp *resp, int *inlen)
+                         struct mlx5_ib_create_qp_resp *resp, int *inlen,
+                         struct mlx5_ib_qp_base *base)
 {
        struct mlx5_ib_ucontext *context;
        struct mlx5_ib_create_qp ucmd;
+       struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer;
        int page_shift = 0;
        int uar_index;
        int npages;
@@ -615,18 +665,23 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        /*
         * TBD: should come from the verbs when we have the API
         */
-       uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
-       if (uuarn < 0) {
-               mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
-               mlx5_ib_dbg(dev, "reverting to medium latency\n");
-               uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
+       if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+               /* In CROSS_CHANNEL CQ and QP must use the same UAR */
+               uuarn = MLX5_CROSS_CHANNEL_UUAR;
+       else {
+               uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
                if (uuarn < 0) {
-                       mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
-                       mlx5_ib_dbg(dev, "reverting to high latency\n");
-                       uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
+                       mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
+                       mlx5_ib_dbg(dev, "reverting to medium latency\n");
+                       uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
                        if (uuarn < 0) {
-                               mlx5_ib_warn(dev, "uuar allocation failed\n");
-                               return uuarn;
+                               mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
+                               mlx5_ib_dbg(dev, "reverting to high latency\n");
+                               uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
+                               if (uuarn < 0) {
+                                       mlx5_ib_warn(dev, "uuar allocation failed\n");
+                                       return uuarn;
+                               }
                        }
                }
        }
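
The UUAR selection above falls back through latency classes, except that a cross-channel QP must share a UAR with its CQ, so the choice is forced; condensed:

	uuarn = (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) ?
		MLX5_CROSS_CHANNEL_UUAR :		/* fixed, shared with the CQ */
		alloc_uuar(&context->uuari, ...);	/* HIGH -> MEDIUM -> LOW */
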
@@ -638,32 +693,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
        qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 
-       err = set_user_buf_size(dev, qp, &ucmd);
+       err = set_user_buf_size(dev, qp, &ucmd, base, attr);
        if (err)
                goto err_uuar;
 
-       if (ucmd.buf_addr && qp->buf_size) {
-               qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
-                                      qp->buf_size, 0, 0);
-               if (IS_ERR(qp->umem)) {
-                       mlx5_ib_dbg(dev, "umem_get failed\n");
-                       err = PTR_ERR(qp->umem);
+       if (ucmd.buf_addr && ubuffer->buf_size) {
+               ubuffer->buf_addr = ucmd.buf_addr;
+               err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
+                                      ubuffer->buf_size,
+                                      &ubuffer->umem, &npages, &page_shift,
+                                      &ncont, &offset);
+               if (err)
                        goto err_uuar;
-               }
        } else {
-               qp->umem = NULL;
-       }
-
-       if (qp->umem) {
-               mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift,
-                                  &ncont, NULL);
-               err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset);
-               if (err) {
-                       mlx5_ib_warn(dev, "bad offset\n");
-                       goto err_umem;
-               }
-               mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n",
-                           ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset);
+               ubuffer->umem = NULL;
        }
 
        *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
@@ -672,8 +715,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                err = -ENOMEM;
                goto err_umem;
        }
-       if (qp->umem)
-               mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
+       if (ubuffer->umem)
+               mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift,
+                                    (*in)->pas, 0);
        (*in)->ctx.log_pg_sz_remote_qpn =
                cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
        (*in)->ctx.params2 = cpu_to_be32(offset << 6);
@@ -704,29 +748,31 @@ err_free:
        kvfree(*in);
 
 err_umem:
-       if (qp->umem)
-               ib_umem_release(qp->umem);
+       if (ubuffer->umem)
+               ib_umem_release(ubuffer->umem);
 
 err_uuar:
        free_uuar(&context->uuari, uuarn);
        return err;
 }
 
-static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp)
+static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp,
+                           struct mlx5_ib_qp_base *base)
 {
        struct mlx5_ib_ucontext *context;
 
        context = to_mucontext(pd->uobject->context);
        mlx5_ib_db_unmap_user(context, &qp->db);
-       if (qp->umem)
-               ib_umem_release(qp->umem);
+       if (base->ubuffer.umem)
+               ib_umem_release(base->ubuffer.umem);
        free_uuar(&context->uuari, qp->uuarn);
 }
 
 static int create_kernel_qp(struct mlx5_ib_dev *dev,
                            struct ib_qp_init_attr *init_attr,
                            struct mlx5_ib_qp *qp,
-                           struct mlx5_create_qp_mbox_in **in, int *inlen)
+                           struct mlx5_create_qp_mbox_in **in, int *inlen,
+                           struct mlx5_ib_qp_base *base)
 {
        enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
        struct mlx5_uuar_info *uuari;
@@ -758,9 +804,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 
        qp->rq.offset = 0;
        qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
-       qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
+       base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
 
-       err = mlx5_buf_alloc(dev->mdev, qp->buf_size, &qp->buf);
+       err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                goto err_uuar;
@@ -853,19 +899,304 @@ static int is_connected(enum ib_qp_type qp_type)
        return 0;
 }
 
+static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
+                                   struct mlx5_ib_sq *sq, u32 tdn)
+{
+       u32 in[MLX5_ST_SZ_DW(create_tis_in)];
+       void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+       memset(in, 0, sizeof(in));
+
+       MLX5_SET(tisc, tisc, transport_domain, tdn);
+
+       return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn);
+}
+
+static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
+                                     struct mlx5_ib_sq *sq)
+{
+       mlx5_core_destroy_tis(dev->mdev, sq->tisn);
+}
+
+static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+                                  struct mlx5_ib_sq *sq, void *qpin,
+                                  struct ib_pd *pd)
+{
+       struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer;
+       __be64 *pas;
+       void *in;
+       void *sqc;
+       void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc);
+       void *wq;
+       int inlen;
+       int err;
+       int page_shift = 0;
+       int npages;
+       int ncont = 0;
+       u32 offset = 0;
+
+       err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
+                              &sq->ubuffer.umem, &npages, &page_shift,
+                              &ncont, &offset);
+       if (err)
+               return err;
+
+       inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont;
+       in = mlx5_vzalloc(inlen);
+       if (!in) {
+               err = -ENOMEM;
+               goto err_umem;
+       }
+
+       sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
+       MLX5_SET(sqc, sqc, flush_in_error_en, 1);
+       MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
+       MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index));
+       MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd));
+       MLX5_SET(sqc, sqc, tis_lst_sz, 1);
+       MLX5_SET(sqc, sqc, tis_num_0, sq->tisn);
+
+       wq = MLX5_ADDR_OF(sqc, sqc, wq);
+       MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+       MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
+       MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page));
+       MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
+       MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
+       MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size));
+       MLX5_SET(wq, wq, log_wq_pg_sz,  page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+       MLX5_SET(wq, wq, page_offset, offset);
+
+       pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
+       mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0);
+
+       err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp);
+
+       kvfree(in);
+
+       if (err)
+               goto err_umem;
+
+       return 0;
+
+err_umem:
+       ib_umem_release(sq->ubuffer.umem);
+       sq->ubuffer.umem = NULL;
+
+       return err;
+}
+
+static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+                                    struct mlx5_ib_sq *sq)
+{
+       mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
+       ib_umem_release(sq->ubuffer.umem);
+}
+
+static int get_rq_pas_size(void *qpc)
+{
+       u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12;
+       u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride);
+       u32 log_rq_size   = MLX5_GET(qpc, qpc, log_rq_size);
+       u32 page_offset   = MLX5_GET(qpc, qpc, page_offset);
+       u32 po_quanta     = 1 << (log_page_size - 6);
+       u32 rq_sz         = 1 << (log_rq_size + 4 + log_rq_stride);
+       u32 page_size     = 1 << log_page_size;
+       u32 rq_sz_po      = rq_sz + (page_offset * po_quanta);
+       u32 rq_num_pas    = (rq_sz_po + page_size - 1) / page_size;
+
+       return rq_num_pas * sizeof(u64);
+}
+
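
A worked example of the sizing above, with assumed field values (log_page_size field 0, i.e. 4 KiB pages after the +12; log_rq_stride 2; log_rq_size 8; page_offset 0):

	rq_sz      = 1 << (8 + 4 + 2)      = 16384 bytes
	rq_sz_po   = 16384 + 0 * 64        = 16384 bytes
	rq_num_pas = (16384 + 4095) / 4096 = 4 pages
	return       4 * sizeof(u64)       = 32 bytes of PAS entries
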
+static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
+                                  struct mlx5_ib_rq *rq, void *qpin)
+{
+       __be64 *pas;
+       __be64 *qp_pas;
+       void *in;
+       void *rqc;
+       void *wq;
+       void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc);
+       int inlen;
+       int err;
+       u32 rq_pas_size = get_rq_pas_size(qpc);
+
+       inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size;
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+       MLX5_SET(rqc, rqc, vsd, 1);
+       MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
+       MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
+       MLX5_SET(rqc, rqc, flush_in_error_en, 1);
+       MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index));
+       MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv));
+
+       wq = MLX5_ADDR_OF(rqc, rqc, wq);
+       MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+       MLX5_SET(wq, wq, end_padding_mode,
+                MLX5_GET64(qpc, qpc, end_padding_mode));
+       MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset));
+       MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
+       MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
+       MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4);
+       MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size));
+       MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size));
+
+       pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
+       qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas);
+       memcpy(pas, qp_pas, rq_pas_size);
+
+       err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp);
+
+       kvfree(in);
+
+       return err;
+}
+
+static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
+                                    struct mlx5_ib_rq *rq)
+{
+       mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp);
+}
+
+static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
+                                   struct mlx5_ib_rq *rq, u32 tdn)
+{
+       u32 *in;
+       void *tirc;
+       int inlen;
+       int err;
+
+       inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+       MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
+       MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn);
+       MLX5_SET(tirc, tirc, transport_domain, tdn);
+
+       err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);
+
+       kvfree(in);
+
+       return err;
+}
+
+static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
+                                     struct mlx5_ib_rq *rq)
+{
+       mlx5_core_destroy_tir(dev->mdev, rq->tirn);
+}
+
+static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                               struct mlx5_create_qp_mbox_in *in,
+                               struct ib_pd *pd)
+{
+       struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+       struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+       struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+       struct ib_uobject *uobj = pd->uobject;
+       struct ib_ucontext *ucontext = uobj->context;
+       struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+       int err;
+       u32 tdn = mucontext->tdn;
+
+       if (qp->sq.wqe_cnt) {
+               err = create_raw_packet_qp_tis(dev, sq, tdn);
+               if (err)
+                       return err;
+
+               err = create_raw_packet_qp_sq(dev, sq, in, pd);
+               if (err)
+                       goto err_destroy_tis;
+
+               sq->base.container_mibqp = qp;
+       }
+
+       if (qp->rq.wqe_cnt) {
+               err = create_raw_packet_qp_rq(dev, rq, in);
+               if (err)
+                       goto err_destroy_sq;
+
+               rq->base.container_mibqp = qp;
+
+               err = create_raw_packet_qp_tir(dev, rq, tdn);
+               if (err)
+                       goto err_destroy_rq;
+       }
+
+       qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn :
+                                                    rq->base.mqp.qpn;
+
+       return 0;
+
+err_destroy_rq:
+       destroy_raw_packet_qp_rq(dev, rq);
+err_destroy_sq:
+       if (!qp->sq.wqe_cnt)
+               return err;
+       destroy_raw_packet_qp_sq(dev, sq);
+err_destroy_tis:
+       destroy_raw_packet_qp_tis(dev, sq);
+
+       return err;
+}
+
+static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev,
+                                 struct mlx5_ib_qp *qp)
+{
+       struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+       struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+       struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+
+       if (qp->rq.wqe_cnt) {
+               destroy_raw_packet_qp_tir(dev, rq);
+               destroy_raw_packet_qp_rq(dev, rq);
+       }
+
+       if (qp->sq.wqe_cnt) {
+               destroy_raw_packet_qp_sq(dev, sq);
+               destroy_raw_packet_qp_tis(dev, sq);
+       }
+}
+
+static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp,
+                                   struct mlx5_ib_raw_packet_qp *raw_packet_qp)
+{
+       struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+       struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+
+       sq->sq = &qp->sq;
+       rq->rq = &qp->rq;
+       sq->doorbell = &qp->db;
+       rq->doorbell = &qp->db;
+}
+
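Taken together, a Raw Packet QP is assembled from separate firmware objects rather than a single QP context:

	send side:  TIS --> SQ    (create_raw_packet_qp_tis / _sq)
	recv side:  RQ  --> TIR   (create_raw_packet_qp_rq  / _tir)

The IB-visible QP number is borrowed from the SQ when one exists, otherwise from the RQ, and each sub-object records a container_mibqp back-pointer so that to_mibqp() still resolves firmware events to the owning QP.
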
 static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                            struct ib_qp_init_attr *init_attr,
                            struct ib_udata *udata, struct mlx5_ib_qp *qp)
 {
        struct mlx5_ib_resources *devr = &dev->devr;
        struct mlx5_core_dev *mdev = dev->mdev;
+       struct mlx5_ib_qp_base *base;
        struct mlx5_ib_create_qp_resp resp;
        struct mlx5_create_qp_mbox_in *in;
        struct mlx5_ib_create_qp ucmd;
        int inlen = sizeof(*in);
        int err;
+       u32 uidx = MLX5_IB_DEFAULT_UIDX;
+       void *qpc;
+
+       base = init_attr->qp_type == IB_QPT_RAW_PACKET ?
+              &qp->raw_packet_qp.rq.base :
+              &qp->trans_qp.base;
 
-       mlx5_ib_odp_create_qp(qp);
+       if (init_attr->qp_type != IB_QPT_RAW_PACKET)
+               mlx5_ib_odp_create_qp(qp);
 
        mutex_init(&qp->mutex);
        spin_lock_init(&qp->sq.lock);
@@ -880,6 +1211,21 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                }
        }
 
+       if (init_attr->create_flags &
+                       (IB_QP_CREATE_CROSS_CHANNEL |
+                        IB_QP_CREATE_MANAGED_SEND |
+                        IB_QP_CREATE_MANAGED_RECV)) {
+               if (!MLX5_CAP_GEN(mdev, cd)) {
+                       mlx5_ib_dbg(dev, "cross-channel isn't supported\n");
+                       return -EINVAL;
+               }
+               if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL)
+                       qp->flags |= MLX5_IB_QP_CROSS_CHANNEL;
+               if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND)
+                       qp->flags |= MLX5_IB_QP_MANAGED_SEND;
+               if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV)
+                       qp->flags |= MLX5_IB_QP_MANAGED_RECV;
+       }
        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
                qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
 
@@ -889,6 +1235,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                        return -EFAULT;
                }
 
+               err = get_qp_user_index(to_mucontext(pd->uobject->context),
+                                       &ucmd, udata->inlen, &uidx);
+               if (err)
+                       return err;
+
                qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
                qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
        } else {
@@ -918,11 +1269,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                                            ucmd.sq_wqe_count, max_wqes);
                                return -EINVAL;
                        }
-                       err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen);
+                       err = create_user_qp(dev, pd, qp, udata, init_attr, &in,
+                                            &resp, &inlen, base);
                        if (err)
                                mlx5_ib_dbg(dev, "err %d\n", err);
                } else {
-                       err = create_kernel_qp(dev, init_attr, qp, &in, &inlen);
+                       err = create_kernel_qp(dev, init_attr, qp, &in, &inlen,
+                                              base);
                        if (err)
                                mlx5_ib_dbg(dev, "err %d\n", err);
                }
@@ -954,6 +1307,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
                in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST);
 
+       if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+               in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_MASTER);
+       if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
+               in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_SEND);
+       if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
+               in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_RECV);
+
        if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
                int rcqe_sz;
                int scqe_sz;
@@ -1018,26 +1378,35 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
        in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma);
 
-       err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen);
+       if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+               qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+               /* 0xffffff means we ask to work with cqe version 0 */
+               MLX5_SET(qpc, qpc, user_index, uidx);
+       }
+
+       if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
+               qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
+               raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
+               err = create_raw_packet_qp(dev, qp, in, pd);
+       } else {
+               err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen);
+       }
+
        if (err) {
                mlx5_ib_dbg(dev, "create qp failed\n");
                goto err_create;
        }
 
        kvfree(in);
-       /* Hardware wants QPN written in big-endian order (after
-        * shifting) for send doorbell.  Precompute this value to save
-        * a little bit when posting sends.
-        */
-       qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 
-       qp->mqp.event = mlx5_ib_qp_event;
+       base->container_mibqp = qp;
+       base->mqp.event = mlx5_ib_qp_event;
 
        return 0;
 
 err_create:
        if (qp->create_type == MLX5_QP_USER)
-               destroy_qp_user(pd, qp);
+               destroy_qp_user(pd, qp, base);
        else if (qp->create_type == MLX5_QP_KERNEL)
                destroy_qp_kernel(dev, qp);
 
@@ -1129,11 +1498,11 @@ static void get_cqs(struct mlx5_ib_qp *qp,
        case IB_QPT_UD:
        case IB_QPT_RAW_IPV6:
        case IB_QPT_RAW_ETHERTYPE:
+       case IB_QPT_RAW_PACKET:
                *send_cq = to_mcq(qp->ibqp.send_cq);
                *recv_cq = to_mcq(qp->ibqp.recv_cq);
                break;
 
-       case IB_QPT_RAW_PACKET:
        case IB_QPT_MAX:
        default:
                *send_cq = NULL;
@@ -1142,45 +1511,66 @@ static void get_cqs(struct mlx5_ib_qp *qp,
        }
 }
 
+static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                               u16 operation);
+
 static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 {
        struct mlx5_ib_cq *send_cq, *recv_cq;
+       struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
        struct mlx5_modify_qp_mbox_in *in;
        int err;
 
+       base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
+              &qp->raw_packet_qp.rq.base :
+              &qp->trans_qp.base;
+
        in = kzalloc(sizeof(*in), GFP_KERNEL);
        if (!in)
                return;
 
        if (qp->state != IB_QPS_RESET) {
-               mlx5_ib_qp_disable_pagefaults(qp);
-               if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
-                                       MLX5_QP_STATE_RST, in, 0, &qp->mqp))
-                       mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
-                                    qp->mqp.qpn);
+               if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
+                       mlx5_ib_qp_disable_pagefaults(qp);
+                       err = mlx5_core_qp_modify(dev->mdev,
+                                                 MLX5_CMD_OP_2RST_QP, in, 0,
+                                                 &base->mqp);
+               } else {
+                       err = modify_raw_packet_qp(dev, qp,
+                                                  MLX5_CMD_OP_2RST_QP);
+               }
+               if (err)
+                       mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n",
+                                    base->mqp.qpn);
        }
 
        get_cqs(qp, &send_cq, &recv_cq);
 
        if (qp->create_type == MLX5_QP_KERNEL) {
                mlx5_ib_lock_cqs(send_cq, recv_cq);
-               __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+               __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
                                   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
                if (send_cq != recv_cq)
-                       __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+                       __mlx5_ib_cq_clean(send_cq, base->mqp.qpn,
+                                          NULL);
                mlx5_ib_unlock_cqs(send_cq, recv_cq);
        }
 
-       err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp);
-       if (err)
-               mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn);
-       kfree(in);
+       if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+               destroy_raw_packet_qp(dev, qp);
+       } else {
+               err = mlx5_core_destroy_qp(dev->mdev, &base->mqp);
+               if (err)
+                       mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n",
+                                    base->mqp.qpn);
+       }
 
+       kfree(in);
 
        if (qp->create_type == MLX5_QP_KERNEL)
                destroy_qp_kernel(dev, qp);
        else if (qp->create_type == MLX5_QP_USER)
-               destroy_qp_user(&get_pd(qp)->ibpd, qp);
+               destroy_qp_user(&get_pd(qp)->ibpd, qp, base);
 }
 
 static const char *ib_qp_type_str(enum ib_qp_type type)
@@ -1234,6 +1624,16 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
                        return ERR_PTR(-EINVAL);
                }
                dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
+
+               if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
+                       if (!pd->uobject) {
+                               mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
+                               return ERR_PTR(-EINVAL);
+                       } else if (!to_mucontext(pd->uobject->context)->cqe_version) {
+                               mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
+                               return ERR_PTR(-EINVAL);
+                       }
+               }
        }
 
        switch (init_attr->qp_type) {
@@ -1250,6 +1650,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
                }
 
                /* fall through */
+       case IB_QPT_RAW_PACKET:
        case IB_QPT_RC:
        case IB_QPT_UC:
        case IB_QPT_UD:
@@ -1272,19 +1673,19 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
                else if (is_qp1(init_attr->qp_type))
                        qp->ibqp.qp_num = 1;
                else
-                       qp->ibqp.qp_num = qp->mqp.qpn;
+                       qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn;
 
                mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
-                           qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn,
+                           qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn,
+                           to_mcq(init_attr->recv_cq)->mcq.cqn,
                            to_mcq(init_attr->send_cq)->mcq.cqn);
 
-               qp->xrcdn = xrcdn;
+               qp->trans_qp.xrcdn = xrcdn;
 
                break;
 
        case IB_QPT_RAW_IPV6:
        case IB_QPT_RAW_ETHERTYPE:
-       case IB_QPT_RAW_PACKET:
        case IB_QPT_MAX:
        default:
                mlx5_ib_dbg(dev, "unsupported qp type %d\n",
@@ -1318,12 +1719,12 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att
        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
                dest_rd_atomic = attr->max_dest_rd_atomic;
        else
-               dest_rd_atomic = qp->resp_depth;
+               dest_rd_atomic = qp->trans_qp.resp_depth;
 
        if (attr_mask & IB_QP_ACCESS_FLAGS)
                access_flags = attr->qp_access_flags;
        else
-               access_flags = qp->atomic_rd_en;
+               access_flags = qp->trans_qp.atomic_rd_en;
 
        if (!dest_rd_atomic)
                access_flags &= IB_ACCESS_REMOTE_WRITE;
@@ -1360,21 +1761,42 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
        return rate + MLX5_STAT_RATE_OFFSET;
 }
 
-static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
+static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev,
+                                     struct mlx5_ib_sq *sq, u8 sl)
+{
+       void *in;
+       void *tisc;
+       int inlen;
+       int err;
+
+       inlen = MLX5_ST_SZ_BYTES(modify_tis_in);
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       MLX5_SET(modify_tis_in, in, bitmask.prio, 1);
+
+       tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
+       MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1));
+
+       err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);
+
+       kvfree(in);
+
+       return err;
+}
+
+static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                        const struct ib_ah_attr *ah,
                         struct mlx5_qp_path *path, u8 port, int attr_mask,
                         u32 path_flags, const struct ib_qp_attr *attr)
 {
+       enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port);
        int err;
 
-       path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
-       path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0;
-
        if (attr_mask & IB_QP_PKEY_INDEX)
                path->pkey_index = attr->pkey_index;
 
-       path->grh_mlid  = ah->src_path_bits & 0x7f;
-       path->rlid      = cpu_to_be16(ah->dlid);
-
        if (ah->ah_flags & IB_AH_GRH) {
                if (ah->grh.sgid_index >=
                    dev->mdev->port_caps[port - 1].gid_table_len) {
@@ -1383,7 +1805,27 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
                               dev->mdev->port_caps[port - 1].gid_table_len);
                        return -EINVAL;
                }
-               path->grh_mlid |= 1 << 7;
+       }
+
+       if (ll == IB_LINK_LAYER_ETHERNET) {
+               if (!(ah->ah_flags & IB_AH_GRH))
+                       return -EINVAL;
+               memcpy(path->rmac, ah->dmac, sizeof(ah->dmac));
+               path->udp_sport = mlx5_get_roce_udp_sport(dev, port,
+                                                         ah->grh.sgid_index);
+               path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4;
+       } else {
+               path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
+               path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 :
+                                                                       0;
+               path->rlid = cpu_to_be16(ah->dlid);
+               path->grh_mlid = ah->src_path_bits & 0x7f;
+               if (ah->ah_flags & IB_AH_GRH)
+                       path->grh_mlid  |= 1 << 7;
+               path->dci_cfi_prio_sl = ah->sl & 0xf;
+       }
+
+       if (ah->ah_flags & IB_AH_GRH) {
                path->mgid_index = ah->grh.sgid_index;
                path->hop_limit  = ah->grh.hop_limit;
                path->tclass_flowlabel =
@@ -1401,7 +1843,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
        if (attr_mask & IB_QP_TIMEOUT)
                path->ackto_lt = attr->timeout << 3;
 
-       path->sl = ah->sl & 0xf;
+       if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt)
+               return modify_raw_packet_eth_prio(dev->mdev,
+                                                 &qp->raw_packet_qp.sq,
+                                                 ah->sl & 0xf);
 
        return 0;
 }
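
Note the split encoding of the service level in dci_cfi_prio_sl: on Ethernet (RoCE) ports the SL is stored in bits 7:4, on InfiniBand ports in bits 3:0, and the query path later reads the IB encoding back with "& 0xf":

	path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4;	/* Ethernet */
	path->dci_cfi_prio_sl = ah->sl & 0xf;		/* InfiniBand */
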
@@ -1549,12 +1994,154 @@ static int ib_mask_to_mlx5_opt(int ib_mask)
        return result;
 }
 
+static int modify_raw_packet_qp_rq(struct mlx5_core_dev *dev,
+                                  struct mlx5_ib_rq *rq, int new_state)
+{
+       void *in;
+       void *rqc;
+       int inlen;
+       int err;
+
+       inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       MLX5_SET(modify_rq_in, in, rq_state, rq->state);
+
+       rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+       MLX5_SET(rqc, rqc, state, new_state);
+
+       err = mlx5_core_modify_rq(dev, rq->base.mqp.qpn, in, inlen);
+       if (err)
+               goto out;
+
+       rq->state = new_state;
+
+out:
+       kvfree(in);
+       return err;
+}
+
+static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev,
+                                  struct mlx5_ib_sq *sq, int new_state)
+{
+       void *in;
+       void *sqc;
+       int inlen;
+       int err;
+
+       inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
+       in = mlx5_vzalloc(inlen);
+       if (!in)
+               return -ENOMEM;
+
+       MLX5_SET(modify_sq_in, in, sq_state, sq->state);
+
+       sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
+       MLX5_SET(sqc, sqc, state, new_state);
+
+       err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen);
+       if (err)
+               goto out;
+
+       sq->state = new_state;
+
+out:
+       kvfree(in);
+       return err;
+}
+
+static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                               u16 operation)
+{
+       struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+       struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+       struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+       int rq_state;
+       int sq_state;
+       int err;
+
+       switch (operation) {
+       case MLX5_CMD_OP_RST2INIT_QP:
+               rq_state = MLX5_RQC_STATE_RDY;
+               sq_state = MLX5_SQC_STATE_RDY;
+               break;
+       case MLX5_CMD_OP_2ERR_QP:
+               rq_state = MLX5_RQC_STATE_ERR;
+               sq_state = MLX5_SQC_STATE_ERR;
+               break;
+       case MLX5_CMD_OP_2RST_QP:
+               rq_state = MLX5_RQC_STATE_RST;
+               sq_state = MLX5_SQC_STATE_RST;
+               break;
+       case MLX5_CMD_OP_INIT2INIT_QP:
+       case MLX5_CMD_OP_INIT2RTR_QP:
+       case MLX5_CMD_OP_RTR2RTS_QP:
+       case MLX5_CMD_OP_RTS2RTS_QP:
+               /* Nothing to do here... */
+               return 0;
+       default:
+               WARN_ON(1);
+               return -EINVAL;
+       }
+
+       if (qp->rq.wqe_cnt) {
+               err =  modify_raw_packet_qp_rq(dev->mdev, rq, rq_state);
+               if (err)
+                       return err;
+       }
+
+       if (qp->sq.wqe_cnt)
+               return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state);
+
+       return 0;
+}
+
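A Raw Packet QP therefore translates a single QP-level command into per-object state changes; as a sketch, resetting such a QP fans out as:

	modify_raw_packet_qp(dev, qp, MLX5_CMD_OP_2RST_QP);
		/* -> modify_raw_packet_qp_rq(mdev, rq, MLX5_RQC_STATE_RST) */
		/* -> modify_raw_packet_qp_sq(mdev, sq, MLX5_SQC_STATE_RST) */
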
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                               const struct ib_qp_attr *attr, int attr_mask,
                               enum ib_qp_state cur_state, enum ib_qp_state new_state)
 {
+       static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
+               [MLX5_QP_STATE_RST] = {
+                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
+                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
+                       [MLX5_QP_STATE_INIT]    = MLX5_CMD_OP_RST2INIT_QP,
+               },
+               [MLX5_QP_STATE_INIT]  = {
+                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
+                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
+                       [MLX5_QP_STATE_INIT]    = MLX5_CMD_OP_INIT2INIT_QP,
+                       [MLX5_QP_STATE_RTR]     = MLX5_CMD_OP_INIT2RTR_QP,
+               },
+               [MLX5_QP_STATE_RTR]   = {
+                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
+                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
+                       [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_RTR2RTS_QP,
+               },
+               [MLX5_QP_STATE_RTS]   = {
+                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
+                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
+                       [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_RTS2RTS_QP,
+               },
+               [MLX5_QP_STATE_SQD] = {
+                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
+                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
+               },
+               [MLX5_QP_STATE_SQER] = {
+                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
+                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
+                       [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_SQERR2RTS_QP,
+               },
+               [MLX5_QP_STATE_ERR] = {
+                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
+                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
+               }
+       };
+
        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx5_ib_qp *qp = to_mqp(ibqp);
+       struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
        struct mlx5_ib_cq *send_cq, *recv_cq;
        struct mlx5_qp_context *context;
        struct mlx5_modify_qp_mbox_in *in;
@@ -1564,6 +2151,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
        int sqd_event;
        int mlx5_st;
        int err;
+       u16 op;
 
        in = kzalloc(sizeof(*in), GFP_KERNEL);
        if (!in)
@@ -1623,7 +2211,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                context->pri_path.port = attr->port_num;
 
        if (attr_mask & IB_QP_AV) {
-               err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path,
+               err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path,
                                    attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
                                    attr_mask, 0, attr);
                if (err)
@@ -1634,7 +2222,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                context->pri_path.ackto_lt |= attr->timeout << 3;
 
        if (attr_mask & IB_QP_ALT_PATH) {
-               err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+               err = mlx5_set_path(dev, qp, &attr->alt_ah_attr,
+                                   &context->alt_path,
                                    attr->alt_port_num, attr_mask, 0, attr);
                if (err)
                        goto out;
@@ -1706,41 +2295,51 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
         * again to RTS, and may cause the driver and the device to get out of
         * sync. */
        if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
-           (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+           (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) &&
+           (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
                mlx5_ib_qp_disable_pagefaults(qp);
 
+       if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE ||
+           !optab[mlx5_cur][mlx5_new])
+               goto out;
+
+       op = optab[mlx5_cur][mlx5_new];
        optpar = ib_mask_to_mlx5_opt(attr_mask);
        optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
        in->optparam = cpu_to_be32(optpar);
-       err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state),
-                                 to_mlx5_state(new_state), in, sqd_event,
-                                 &qp->mqp);
+
+       if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)
+               err = modify_raw_packet_qp(dev, qp, op);
+       else
+               err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event,
+                                         &base->mqp);
        if (err)
                goto out;
 
-       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT &&
+           (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
                mlx5_ib_qp_enable_pagefaults(qp);
 
        qp->state = new_state;
 
        if (attr_mask & IB_QP_ACCESS_FLAGS)
-               qp->atomic_rd_en = attr->qp_access_flags;
+               qp->trans_qp.atomic_rd_en = attr->qp_access_flags;
        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-               qp->resp_depth = attr->max_dest_rd_atomic;
+               qp->trans_qp.resp_depth = attr->max_dest_rd_atomic;
        if (attr_mask & IB_QP_PORT)
                qp->port = attr->port_num;
        if (attr_mask & IB_QP_ALT_PATH)
-               qp->alt_port = attr->alt_port_num;
+               qp->trans_qp.alt_port = attr->alt_port_num;
 
        /*
         * If we moved a kernel QP to RESET, clean up all old CQ
         * entries and reinitialize the QP.
         */
        if (new_state == IB_QPS_RESET && !ibqp->uobject) {
-               mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+               mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
                                 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
                if (send_cq != recv_cq)
-                       mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+                       mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL);
 
                qp->rq.head = 0;
                qp->rq.tail = 0;
@@ -1765,15 +2364,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        enum ib_qp_state cur_state, new_state;
        int err = -EINVAL;
        int port;
+       enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
 
        mutex_lock(&qp->mutex);
 
        cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
        new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
 
+       if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) {
+               port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+               ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port);
+       }
+
        if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR &&
            !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
-                               IB_LINK_LAYER_UNSPECIFIED))
+                               ll))
                goto out;
 
        if ((attr_mask & IB_QP_PORT) &&
@@ -2570,7 +3175,7 @@ static void finish_wqe(struct mlx5_ib_qp *qp,
 
        ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
                                             mlx5_opcode | ((u32)opmod << 24));
-       ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8));
+       ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8));
        ctrl->fm_ce_se |= fence;
        qp->fm_cache = next_fence;
        if (unlikely(qp->wq_sig))
@@ -3003,7 +3608,7 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at
            ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports))
                return;
 
-       ib_ah_attr->sl = path->sl & 0xf;
+       ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf;
 
        ib_ah_attr->dlid          = be16_to_cpu(path->rlid);
        ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f;
@@ -3021,39 +3626,153 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at
        }
 }
 
-int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
-                    struct ib_qp_init_attr *qp_init_attr)
+static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev,
+                                       struct mlx5_ib_sq *sq,
+                                       u8 *sq_state)
+{
+       void *out;
+       void *sqc;
+       int inlen;
+       int err;
+
+       inlen = MLX5_ST_SZ_BYTES(query_sq_out);
+       out = mlx5_vzalloc(inlen);
+       if (!out)
+               return -ENOMEM;
+
+       err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out);
+       if (err)
+               goto out;
+
+       sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context);
+       *sq_state = MLX5_GET(sqc, sqc, state);
+       sq->state = *sq_state;
+
+out:
+       kvfree(out);
+       return err;
+}
+
+static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev,
+                                       struct mlx5_ib_rq *rq,
+                                       u8 *rq_state)
+{
+       void *out;
+       void *rqc;
+       int inlen;
+       int err;
+
+       inlen = MLX5_ST_SZ_BYTES(query_rq_out);
+       out = mlx5_vzalloc(inlen);
+       if (!out)
+               return -ENOMEM;
+
+       err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out);
+       if (err)
+               goto out;
+
+       rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context);
+       *rq_state = MLX5_GET(rqc, rqc, state);
+       rq->state = *rq_state;
+
+out:
+       kvfree(out);
+       return err;
+}
+
+static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state,
+                                 struct mlx5_ib_qp *qp, u8 *qp_state)
+{
+       static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = {
+               [MLX5_RQC_STATE_RST] = {
+                       [MLX5_SQC_STATE_RST]    = IB_QPS_RESET,
+                       [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE_BAD,
+                       [MLX5_SQC_STATE_ERR]    = MLX5_QP_STATE_BAD,
+                       [MLX5_SQ_STATE_NA]      = IB_QPS_RESET,
+               },
+               [MLX5_RQC_STATE_RDY] = {
+                       [MLX5_SQC_STATE_RST]    = MLX5_QP_STATE_BAD,
+                       [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE,
+                       [MLX5_SQC_STATE_ERR]    = IB_QPS_SQE,
+                       [MLX5_SQ_STATE_NA]      = MLX5_QP_STATE,
+               },
+               [MLX5_RQC_STATE_ERR] = {
+                       [MLX5_SQC_STATE_RST]    = MLX5_QP_STATE_BAD,
+                       [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE_BAD,
+                       [MLX5_SQC_STATE_ERR]    = IB_QPS_ERR,
+                       [MLX5_SQ_STATE_NA]      = IB_QPS_ERR,
+               },
+               [MLX5_RQ_STATE_NA] = {
+                       [MLX5_SQC_STATE_RST]    = IB_QPS_RESET,
+                       [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE,
+                       [MLX5_SQC_STATE_ERR]    = MLX5_QP_STATE,
+                       [MLX5_SQ_STATE_NA]      = MLX5_QP_STATE_BAD,
+               },
+       };
+
+       *qp_state = sqrq_trans[rq_state][sq_state];
+
+       if (*qp_state == MLX5_QP_STATE_BAD) {
+               WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x",
+                    qp->raw_packet_qp.sq.base.mqp.qpn, sq_state,
+                    qp->raw_packet_qp.rq.base.mqp.qpn, rq_state);
+               return -EINVAL;
+       }
+
+       if (*qp_state == MLX5_QP_STATE)
+               *qp_state = qp->state;
+
+       return 0;
+}
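
The table above collapses the separately queried SQ and RQ hardware states into one IB QP state: MLX5_QP_STATE_BAD marks combinations the device should never report, while MLX5_QP_STATE is a sentinel meaning "fall back to the state cached in software". A standalone sketch of the same lookup pattern, using illustrative enum values rather than the kernel's definitions:

        #include <stdio.h>

        enum { RQ_RST, RQ_RDY, RQ_ERR, RQ_NA, RQ_NUM };
        enum { SQ_RST, SQ_RDY, SQ_ERR, SQ_NA, SQ_NUM };
        enum { QPS_RESET, QPS_SQE, QPS_ERR, QPS_CACHED, QPS_BAD };

        /* QPS_CACHED plays the role of MLX5_QP_STATE: defer to the
         * software-cached state instead of a hardware-derived one. */
        static const unsigned char trans[RQ_NUM][SQ_NUM] = {
                [RQ_RST] = { [SQ_RST] = QPS_RESET,  [SQ_RDY] = QPS_BAD,
                             [SQ_ERR] = QPS_BAD,    [SQ_NA] = QPS_RESET },
                [RQ_RDY] = { [SQ_RST] = QPS_BAD,    [SQ_RDY] = QPS_CACHED,
                             [SQ_ERR] = QPS_SQE,    [SQ_NA] = QPS_CACHED },
                [RQ_ERR] = { [SQ_RST] = QPS_BAD,    [SQ_RDY] = QPS_BAD,
                             [SQ_ERR] = QPS_ERR,    [SQ_NA] = QPS_ERR },
                [RQ_NA]  = { [SQ_RST] = QPS_RESET,  [SQ_RDY] = QPS_CACHED,
                             [SQ_ERR] = QPS_CACHED, [SQ_NA] = QPS_BAD },
        };

        int main(void)
        {
                /* RQ ready + SQ errored: send side flushed, IB_QPS_SQE */
                printf("RDY/ERR -> %d (QPS_SQE=%d)\n",
                       trans[RQ_RDY][SQ_ERR], QPS_SQE);
                return 0;
        }
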
+
+static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev,
+                                    struct mlx5_ib_qp *qp,
+                                    u8 *raw_packet_qp_state)
+{
+       struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
+       struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
+       struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
+       int err;
+       u8 sq_state = MLX5_SQ_STATE_NA;
+       u8 rq_state = MLX5_RQ_STATE_NA;
+
+       if (qp->sq.wqe_cnt) {
+               err = query_raw_packet_qp_sq_state(dev, sq, &sq_state);
+               if (err)
+                       return err;
+       }
+
+       if (qp->rq.wqe_cnt) {
+               err = query_raw_packet_qp_rq_state(dev, rq, &rq_state);
+               if (err)
+                       return err;
+       }
+
+       return sqrq_state_to_qp_state(sq_state, rq_state, qp,
+                                     raw_packet_qp_state);
+}
+
+static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                        struct ib_qp_attr *qp_attr)
 {
-       struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
-       struct mlx5_ib_qp *qp = to_mqp(ibqp);
        struct mlx5_query_qp_mbox_out *outb;
        struct mlx5_qp_context *context;
        int mlx5_state;
        int err = 0;
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       /*
-        * Wait for any outstanding page faults, in case the user frees memory
-        * based upon this query's result.
-        */
-       flush_workqueue(mlx5_ib_page_fault_wq);
-#endif
-
-       mutex_lock(&qp->mutex);
        outb = kzalloc(sizeof(*outb), GFP_KERNEL);
-       if (!outb) {
-               err = -ENOMEM;
-               goto out;
-       }
+       if (!outb)
+               return -ENOMEM;
+
        context = &outb->ctx;
-       err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb));
+       err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb,
+                                sizeof(*outb));
        if (err)
-               goto out_free;
+               goto out;
 
        mlx5_state = be32_to_cpu(context->flags) >> 28;
 
        qp->state                    = to_ib_qp_state(mlx5_state);
-       qp_attr->qp_state            = qp->state;
        qp_attr->path_mtu            = context->mtu_msgmax >> 5;
        qp_attr->path_mig_state      =
                to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
@@ -3087,6 +3806,43 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
        qp_attr->retry_cnt          = (be32_to_cpu(context->params1) >> 16) & 0x7;
        qp_attr->rnr_retry          = (be32_to_cpu(context->params1) >> 13) & 0x7;
        qp_attr->alt_timeout        = context->alt_path.ackto_lt >> 3;
+
+out:
+       kfree(outb);
+       return err;
+}
+
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+                    int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx5_ib_qp *qp = to_mqp(ibqp);
+       int err = 0;
+       u8 raw_packet_qp_state;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       /*
+        * Wait for any outstanding page faults, in case the user frees memory
+        * based upon this query's result.
+        */
+       flush_workqueue(mlx5_ib_page_fault_wq);
+#endif
+
+       mutex_lock(&qp->mutex);
+
+       if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+               err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state);
+               if (err)
+                       goto out;
+               qp->state = raw_packet_qp_state;
+               qp_attr->port_num = 1;
+       } else {
+               err = query_qp_attr(dev, qp, qp_attr);
+               if (err)
+                       goto out;
+       }
+
+       qp_attr->qp_state            = qp->state;
        qp_attr->cur_qp_state        = qp_attr->qp_state;
        qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
        qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
@@ -3110,12 +3866,16 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
        if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
                qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
 
+       if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+               qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL;
+       if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
+               qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND;
+       if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
+               qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV;
+
        qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
                IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
 
-out_free:
-       kfree(outb);
-
 out:
        mutex_unlock(&qp->mutex);
        return err;
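
For orientation, this is the entry point behind ib_query_qp() on mlx5 hardware. A hedged sketch of a kernel consumer reaching it through the verbs layer; check_qp_state() is an invented helper name and error handling is trimmed:

        #include <rdma/ib_verbs.h>

        /* Hypothetical caller: ib_query_qp() dispatches to the device's
         * query_qp method, mlx5_ib_query_qp() in this case. */
        static int check_qp_state(struct ib_qp *qp)
        {
                struct ib_qp_attr attr;
                struct ib_qp_init_attr init_attr;
                int err;

                err = ib_query_qp(qp, &attr, IB_QP_STATE, &init_attr);
                if (err)
                        return err;

                /* For IB_QPT_RAW_PACKET this is the collapsed SQ/RQ hardware
                 * state; otherwise it comes from the QP context. */
                return attr.qp_state == IB_QPS_ERR ? -EIO : 0;
        }
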
index e008505e96e9778ff9a71cfc021de7ecee6c3331..4659256cd95e698c3a9dad3c69407276e2117f29 100644 (file)
@@ -78,28 +78,41 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
                           struct ib_udata *udata, int buf_size, int *inlen)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
-       struct mlx5_ib_create_srq ucmd;
+       struct mlx5_ib_create_srq ucmd = {};
        size_t ucmdlen;
+       void *xsrqc;
        int err;
        int npages;
        int page_shift;
        int ncont;
        u32 offset;
+       u32 uidx = MLX5_IB_DEFAULT_UIDX;
+       int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
 
-       ucmdlen =
-               (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) <
-                sizeof(ucmd)) ? (sizeof(ucmd) -
-                                 sizeof(ucmd.reserved)) : sizeof(ucmd);
+       if (drv_data < 0)
+               return -EINVAL;
+
+       ucmdlen = (drv_data < sizeof(ucmd)) ?
+                 drv_data : sizeof(ucmd);
 
        if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) {
                mlx5_ib_dbg(dev, "failed copy udata\n");
                return -EFAULT;
        }
 
-       if (ucmdlen == sizeof(ucmd) &&
-           ucmd.reserved != 0)
+       if (ucmd.reserved0 || ucmd.reserved1)
                return -EINVAL;
 
+       if (drv_data > sizeof(ucmd) &&
+           !ib_is_udata_cleared(udata, sizeof(ucmd),
+                                drv_data - sizeof(ucmd)))
+               return -EINVAL;
+
+       err = get_srq_user_index(to_mucontext(pd->uobject->context),
+                                &ucmd, udata->inlen, &uidx);
+       if (err)
+               return err;
+
        srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
 
        srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
@@ -138,6 +151,12 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
        (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
        (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
 
+       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+               xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
+                                    xrc_srq_context_entry);
+               MLX5_SET(xrc_srqc, xsrqc, user_index, uidx);
+       }
+
        return 0;
 
 err_in:
@@ -158,6 +177,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
        struct mlx5_wqe_srq_next_seg *next;
        int page_shift;
        int npages;
+       void *xsrqc;
 
        err = mlx5_db_alloc(dev->mdev, &srq->db);
        if (err) {
@@ -204,6 +224,13 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
 
        (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 
+       if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) {
+               xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
+                                    xrc_srq_context_entry);
+               /* 0xffffff means we ask to work with cqe version 0 */
+               MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX);
+       }
+
        return 0;
 
 err_in:
index 76fb7b927d373ef2710d412ec015309fb24fdd72..b94a55404a596dab323a843dec9a817fddbaff74 100644 (file)
@@ -35,6 +35,8 @@
 
 #include <linux/types.h>
 
+#include "mlx5_ib.h"
+
 enum {
        MLX5_QP_FLAG_SIGNATURE          = 1 << 0,
        MLX5_QP_FLAG_SCATTER_CQE        = 1 << 1,
@@ -66,7 +68,15 @@ struct mlx5_ib_alloc_ucontext_req_v2 {
        __u32   total_num_uuars;
        __u32   num_low_latency_uuars;
        __u32   flags;
-       __u32   reserved;
+       __u32   comp_mask;
+       __u8    max_cqe_version;
+       __u8    reserved0;
+       __u16   reserved1;
+       __u32   reserved2;
+};
+
+enum mlx5_ib_alloc_ucontext_resp_mask {
+       MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
 };
 
 struct mlx5_ib_alloc_ucontext_resp {
@@ -80,7 +90,13 @@ struct mlx5_ib_alloc_ucontext_resp {
        __u32   max_recv_wr;
        __u32   max_srq_recv_wr;
        __u16   num_ports;
-       __u16   reserved;
+       __u16   reserved1;
+       __u32   comp_mask;
+       __u32   response_length;
+       __u8    cqe_version;
+       __u8    reserved2;
+       __u16   reserved3;
+       __u64   hca_core_clock_offset;
 };
 
 struct mlx5_ib_alloc_pd_resp {
@@ -110,7 +126,9 @@ struct mlx5_ib_create_srq {
        __u64   buf_addr;
        __u64   db_addr;
        __u32   flags;
-       __u32   reserved; /* explicit padding (optional on i386) */
+       __u32   reserved0; /* explicit padding (optional on i386) */
+       __u32   uidx;
+       __u32   reserved1;
 };
 
 struct mlx5_ib_create_srq_resp {
@@ -125,9 +143,48 @@ struct mlx5_ib_create_qp {
        __u32   rq_wqe_count;
        __u32   rq_wqe_shift;
        __u32   flags;
+       __u32   uidx;
+       __u32   reserved0;
+       __u64   sq_buf_addr;
 };
 
 struct mlx5_ib_create_qp_resp {
        __u32   uuar_index;
 };
+
+static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext,
+                                   struct mlx5_ib_create_qp *ucmd,
+                                   int inlen,
+                                   u32 *user_index)
+{
+       u8 cqe_version = ucontext->cqe_version;
+
+       if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) &&
+           !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+               return 0;
+
+       if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) !=
+              !!cqe_version))
+               return -EINVAL;
+
+       return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
+}
+
+static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext,
+                                    struct mlx5_ib_create_srq *ucmd,
+                                    int inlen,
+                                    u32 *user_index)
+{
+       u8 cqe_version = ucontext->cqe_version;
+
+       if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) &&
+           !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+               return 0;
+
+       if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) !=
+              !!cqe_version))
+               return -EINVAL;
+
+       return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
+}
 #endif /* MLX5_IB_USER_H */
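
Both helpers hinge on the field_avail() idiom: a struct field only "exists" for ABI purposes if the caller's input length covers it, which is how a uABI command struct can grow without breaking old userspace. A standalone demo with a toy command struct standing in for the mlx5 ones (the macro body matches the usual mlx5_ib.h definition):

        #include <stddef.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Same shape as the field_avail() macro in mlx5_ib.h. */
        #define field_avail(type, fld, sz) \
                (offsetof(type, fld) + sizeof(((type *)0)->fld) <= (sz))

        struct toy_cmd {
                uint64_t buf_addr;      /* the v0 ABI ends here */
                uint32_t uidx;          /* added in a later revision */
                uint32_t reserved;
        };

        int main(void)
        {
                size_t old_inlen = sizeof(uint64_t);    /* old userspace */
                size_t new_inlen = sizeof(struct toy_cmd);

                printf("old user supplied uidx? %d\n",
                       (int)field_avail(struct toy_cmd, uidx, old_inlen));
                printf("new user supplied uidx? %d\n",
                       (int)field_avail(struct toy_cmd, uidx, new_inlen));
                return 0;
        }
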
index 40ba8333815571ff2b8c5e24e173713f3a16e66e..a6531ffe29a6f7afebd52ae41223f7fd7d74d131 100644 (file)
@@ -608,9 +608,6 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
                        entry->opcode    = IB_WC_FETCH_ADD;
                        entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
                        break;
-               case MTHCA_OPCODE_BIND_MW:
-                       entry->opcode    = IB_WC_BIND_MW;
-                       break;
                default:
                        entry->opcode    = MTHCA_OPCODE_INVALID;
                        break;
index dc2d48c59e6274c54e84af0d26b4da1b56271b13..9866c35cc977d5036b8b7c6b1126a3508eef9212 100644 (file)
@@ -898,89 +898,6 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
        return &mr->ibmr;
 }
 
-static struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
-                                      struct ib_phys_buf *buffer_list,
-                                      int                 num_phys_buf,
-                                      int                 acc,
-                                      u64                *iova_start)
-{
-       struct mthca_mr *mr;
-       u64 *page_list;
-       u64 total_size;
-       unsigned long mask;
-       int shift;
-       int npages;
-       int err;
-       int i, j, n;
-
-       mask = buffer_list[0].addr ^ *iova_start;
-       total_size = 0;
-       for (i = 0; i < num_phys_buf; ++i) {
-               if (i != 0)
-                       mask |= buffer_list[i].addr;
-               if (i != num_phys_buf - 1)
-                       mask |= buffer_list[i].addr + buffer_list[i].size;
-
-               total_size += buffer_list[i].size;
-       }
-
-       if (mask & ~PAGE_MASK)
-               return ERR_PTR(-EINVAL);
-
-       shift = __ffs(mask | 1 << 31);
-
-       buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
-       buffer_list[0].addr &= ~0ull << shift;
-
-       mr = kmalloc(sizeof *mr, GFP_KERNEL);
-       if (!mr)
-               return ERR_PTR(-ENOMEM);
-
-       npages = 0;
-       for (i = 0; i < num_phys_buf; ++i)
-               npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
-
-       if (!npages)
-               return &mr->ibmr;
-
-       page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
-       if (!page_list) {
-               kfree(mr);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       n = 0;
-       for (i = 0; i < num_phys_buf; ++i)
-               for (j = 0;
-                    j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
-                    ++j)
-                       page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
-
-       mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
-                 "in PD %x; shift %d, npages %d.\n",
-                 (unsigned long long) buffer_list[0].addr,
-                 (unsigned long long) *iova_start,
-                 to_mpd(pd)->pd_num,
-                 shift, npages);
-
-       err = mthca_mr_alloc_phys(to_mdev(pd->device),
-                                 to_mpd(pd)->pd_num,
-                                 page_list, shift, npages,
-                                 *iova_start, total_size,
-                                 convert_access(acc), mr);
-
-       if (err) {
-               kfree(page_list);
-               kfree(mr);
-               return ERR_PTR(err);
-       }
-
-       kfree(page_list);
-       mr->umem = NULL;
-
-       return &mr->ibmr;
-}
-
 static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                       u64 virt, int acc, struct ib_udata *udata)
 {
@@ -1346,7 +1263,6 @@ int mthca_register_device(struct mthca_dev *dev)
        dev->ib_dev.destroy_cq           = mthca_destroy_cq;
        dev->ib_dev.poll_cq              = mthca_poll_cq;
        dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
-       dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
        dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
        dev->ib_dev.dereg_mr             = mthca_dereg_mr;
        dev->ib_dev.get_port_immutable   = mthca_port_immutable;
index 35fe506e2cfa892259a4975919b07355be3df345..96e5fb91fb48e7c48d802e8b30e7e5af20c6a630 100644 (file)
@@ -1485,7 +1485,7 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
        u16 pkey;
 
        ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0,
-                         mthca_ah_grh_present(to_mah(wr->ah)), 0,
+                         mthca_ah_grh_present(to_mah(wr->ah)), 0, 0, 0,
                          &sqp->ud_header);
 
        err = mthca_read_ah(dev, to_mah(wr->ah), &sqp->ud_header);
index 8a3ad170d790cc336c08a527db314d859beefd6c..cb9f0f27308de0dd439844cc06742a74caa52899 100644 (file)
@@ -134,7 +134,7 @@ static void record_ird_ord(struct nes_cm_node *, u16, u16);
 /* External CM API Interface */
 /* instance of function pointers for client API */
 /* set address of this instance to cm_core->cm_ops at cm_core alloc */
-static struct nes_cm_ops nes_cm_api = {
+static const struct nes_cm_ops nes_cm_api = {
        mini_cm_accelerated,
        mini_cm_listen,
        mini_cm_del_listen,
@@ -3232,7 +3232,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
        int passive_state;
        struct nes_ib_device *nesibdev;
        struct ib_mr *ibmr = NULL;
-       struct ib_phys_buf ibphysbuf;
        struct nes_pd *nespd;
        u64 tagged_offset;
        u8 mpa_frame_offset = 0;
@@ -3316,21 +3315,19 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                u64temp = (unsigned long)nesqp;
                nesibdev = nesvnic->nesibdev;
                nespd = nesqp->nespd;
-               ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset;
-               ibphysbuf.size = buff_len;
                tagged_offset = (u64)(unsigned long)*start_buff;
-               ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd,
-                                                  &ibphysbuf, 1,
-                                                  IB_ACCESS_LOCAL_WRITE,
-                                                  &tagged_offset);
-               if (!ibmr) {
+               ibmr = nes_reg_phys_mr(&nespd->ibpd,
+                               nesqp->ietf_frame_pbase + mpa_frame_offset,
+                               buff_len, IB_ACCESS_LOCAL_WRITE,
+                               &tagged_offset);
+               if (IS_ERR(ibmr)) {
                        nes_debug(NES_DBG_CM, "Unable to register memory region"
                                  "for lSMM for cm_node = %p \n",
                                  cm_node);
                        pci_free_consistent(nesdev->pcidev,
                                            nesqp->private_data_len + nesqp->ietf_frame_size,
                                            nesqp->ietf_frame, nesqp->ietf_frame_pbase);
-                       return -ENOMEM;
+                       return PTR_ERR(ibmr);
                }
 
                ibmr->pd = &nespd->ibpd;
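
Besides switching to the exported nes_reg_phys_mr(), the hunk corrects the failure check: the function reports errors as ERR_PTR-encoded pointers, never NULL, so the old !ibmr test accepted failed registrations. A minimal user-space model of the <linux/err.h> convention:

        #include <stdio.h>

        #define MAX_ERRNO 4095

        static inline void *ERR_PTR(long error) { return (void *)error; }
        static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
        static inline int IS_ERR(const void *ptr)
        {
                return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
        }

        static void *reg_mr(int fail)
        {
                static int mr;  /* stand-in for a real MR object */

                return fail ? ERR_PTR(-12 /* -ENOMEM */) : (void *)&mr;
        }

        int main(void)
        {
                void *mr = reg_mr(1);

                if (IS_ERR(mr)) /* a plain NULL check would miss this */
                        printf("registration failed: %ld\n", PTR_ERR(mr));
                return 0;
        }
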
index 32a6420c29400184ea8f392c9e7004eb51611b67..147c2c884227d82ad3f40cec561a91abee71f80a 100644 (file)
@@ -423,7 +423,7 @@ struct nes_cm_core {
 
        struct timer_list       tcp_timer;
 
-       struct nes_cm_ops       *api;
+       const struct nes_cm_ops *api;
 
        int (*post_event)(struct nes_cm_event *event);
        atomic_t                events_posted;
index 2042c0f2975942a3284bce33eda298cfc32ca11d..6d3a169c049b315d764dc1eec9dd47436c53b1a4 100644 (file)
@@ -727,7 +727,7 @@ int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 acti
        if (action == NES_ARP_DELETE) {
                nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index);
                nesadapter->arp_table[arp_index].ip_addr = 0;
-               memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN);
+               eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr);
                nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index);
                return arp_index;
        }
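
eth_zero_addr() from <linux/etherdevice.h> is exactly the memset it replaces, just self-documenting. A standalone copy showing the equivalence:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        #define ETH_ALEN 6      /* bytes in an Ethernet address */

        static inline void eth_zero_addr(uint8_t *addr)
        {
                memset(addr, 0x00, ETH_ALEN);   /* what the old code spelled out */
        }

        int main(void)
        {
                uint8_t mac[ETH_ALEN] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x01 };

                eth_zero_addr(mac);
                printf("%02x:%02x\n", mac[0], mac[5]);  /* 00:00 */
                return 0;
        }
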
index 137880a19ebe4827bc7f8b419ba6faf9dc50de41..8c4daf7f22ec14fc5a56b13f5ddeca9c09a142e7 100644 (file)
@@ -206,80 +206,6 @@ static int nes_dealloc_mw(struct ib_mw *ibmw)
 }
 
 
-/**
- * nes_bind_mw
- */
-static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw,
-               struct ib_mw_bind *ibmw_bind)
-{
-       u64 u64temp;
-       struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
-       struct nes_device *nesdev = nesvnic->nesdev;
-       /* struct nes_mr *nesmr = to_nesmw(ibmw); */
-       struct nes_qp *nesqp = to_nesqp(ibqp);
-       struct nes_hw_qp_wqe *wqe;
-       unsigned long flags = 0;
-       u32 head;
-       u32 wqe_misc = 0;
-       u32 qsize;
-
-       if (nesqp->ibqp_state > IB_QPS_RTS)
-               return -EINVAL;
-
-       spin_lock_irqsave(&nesqp->lock, flags);
-
-       head = nesqp->hwqp.sq_head;
-       qsize = nesqp->hwqp.sq_tail;
-
-       /* Check for SQ overflow */
-       if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
-               spin_unlock_irqrestore(&nesqp->lock, flags);
-               return -ENOMEM;
-       }
-
-       wqe = &nesqp->hwqp.sq_vbase[head];
-       /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */
-       nes_fill_init_qp_wqe(wqe, nesqp, head);
-       u64temp = ibmw_bind->wr_id;
-       set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp);
-       wqe_misc = NES_IWARP_SQ_OP_BIND;
-
-       wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
-
-       if (ibmw_bind->send_flags & IB_SEND_SIGNALED)
-               wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
-
-       if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE)
-               wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE;
-       if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ)
-               wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ;
-
-       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc);
-       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX,
-                           ibmw_bind->bind_info.mr->lkey);
-       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey);
-       set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX,
-                       ibmw_bind->bind_info.length);
-       wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0;
-       u64temp = (u64)ibmw_bind->bind_info.addr;
-       set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp);
-
-       head++;
-       if (head >= qsize)
-               head = 0;
-
-       nesqp->hwqp.sq_head = head;
-       barrier();
-
-       nes_write32(nesdev->regs+NES_WQE_ALLOC,
-                       (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
-
-       spin_unlock_irqrestore(&nesqp->lock, flags);
-
-       return 0;
-}
-
-
 /*
  * nes_alloc_fast_mr
  */
@@ -2074,9 +2000,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 /**
  * nes_reg_phys_mr
  */
-static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
-               struct ib_phys_buf *buffer_list, int num_phys_buf, int acc,
-               u64 * iova_start)
+struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size,
+               int acc, u64 *iova_start)
 {
        u64 region_length;
        struct nes_pd *nespd = to_nespd(ib_pd);
@@ -2088,13 +2013,10 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
        struct nes_vpbl vpbl;
        struct nes_root_vpbl root_vpbl;
        u32 stag;
-       u32 i;
        unsigned long mask;
        u32 stag_index = 0;
        u32 next_stag_index = 0;
        u32 driver_key = 0;
-       u32 root_pbl_index = 0;
-       u32 cur_pbl_index = 0;
        int err = 0;
        int ret = 0;
        u16 pbl_count = 0;
@@ -2113,11 +2035,8 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
 
        next_stag_index >>= 8;
        next_stag_index %= nesadapter->max_mr;
-       if (num_phys_buf > (1024*512)) {
-               return ERR_PTR(-E2BIG);
-       }
 
-       if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK)
+       if ((addr ^ *iova_start) & ~PAGE_MASK)
                return ERR_PTR(-EINVAL);
 
        err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr,
@@ -2132,84 +2051,33 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
                return ERR_PTR(-ENOMEM);
        }
 
-       for (i = 0; i < num_phys_buf; i++) {
+       /* Allocate a 4K buffer for the PBL */
+       vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+                       &vpbl.pbl_pbase);
+       nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
+                       vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
+       if (!vpbl.pbl_vbase) {
+               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+               ibmr = ERR_PTR(-ENOMEM);
+               kfree(nesmr);
+               goto reg_phys_err;
+       }
 
-               if ((i & 0x01FF) == 0) {
-                       if (root_pbl_index == 1) {
-                               /* Allocate the root PBL */
-                               root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192,
-                                               &root_vpbl.pbl_pbase);
-                               nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
-                                               root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
-                               if (!root_vpbl.pbl_vbase) {
-                                       pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-                                                       vpbl.pbl_pbase);
-                                       nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-                                       kfree(nesmr);
-                                       return ERR_PTR(-ENOMEM);
-                               }
-                               root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL);
-                               if (!root_vpbl.leaf_vpbl) {
-                                       pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
-                                                       root_vpbl.pbl_pbase);
-                                       pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-                                                       vpbl.pbl_pbase);
-                                       nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-                                       kfree(nesmr);
-                                       return ERR_PTR(-ENOMEM);
-                               }
-                               root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase);
-                               root_vpbl.pbl_vbase[0].pa_high =
-                                               cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
-                               root_vpbl.leaf_vpbl[0] = vpbl;
-                       }
-                       /* Allocate a 4K buffer for the PBL */
-                       vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
-                                       &vpbl.pbl_pbase);
-                       nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
-                                       vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
-                       if (!vpbl.pbl_vbase) {
-                               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-                               ibmr = ERR_PTR(-ENOMEM);
-                               kfree(nesmr);
-                               goto reg_phys_err;
-                       }
-                       /* Fill in the root table */
-                       if (1 <= root_pbl_index) {
-                               root_vpbl.pbl_vbase[root_pbl_index].pa_low =
-                                               cpu_to_le32((u32)vpbl.pbl_pbase);
-                               root_vpbl.pbl_vbase[root_pbl_index].pa_high =
-                                               cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
-                               root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
-                       }
-                       root_pbl_index++;
-                       cur_pbl_index = 0;
-               }
 
-               mask = !buffer_list[i].size;
-               if (i != 0)
-                       mask |= buffer_list[i].addr;
-               if (i != num_phys_buf - 1)
-                       mask |= buffer_list[i].addr + buffer_list[i].size;
-
-               if (mask & ~PAGE_MASK) {
-                       nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-                       nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
-                       ibmr = ERR_PTR(-EINVAL);
-                       kfree(nesmr);
-                       goto reg_phys_err;
-               }
+       mask = !size;
 
-               region_length += buffer_list[i].size;
-               if ((i != 0) && (single_page)) {
-                       if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr)
-                               single_page = 0;
-               }
-               vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr & PAGE_MASK);
-               vpbl.pbl_vbase[cur_pbl_index++].pa_high =
-                               cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32)));
+       if (mask & ~PAGE_MASK) {
+               nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+               nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
+               ibmr = ERR_PTR(-EINVAL);
+               kfree(nesmr);
+               goto reg_phys_err;
        }
 
+       region_length += size;
+       vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK);
+       vpbl.pbl_vbase[0].pa_high = cpu_to_le32((u32)((((u64)addr) >> 32)));
+
        stag = stag_index << 8;
        stag |= driver_key;
        stag += (u32)stag_key;
@@ -2219,17 +2087,15 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
                        stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index);
 
        /* Make the leaf PBL the root if only one PBL */
-       if (root_pbl_index == 1) {
-               root_vpbl.pbl_pbase = vpbl.pbl_pbase;
-       }
+       root_vpbl.pbl_pbase = vpbl.pbl_pbase;
 
        if (single_page) {
                pbl_count = 0;
        } else {
-               pbl_count = root_pbl_index;
+               pbl_count = 1;
        }
        ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
-                       buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start,
+                       addr, pbl_count, 1, acc, iova_start,
                        &nesmr->pbls_used, &nesmr->pbl_4k);
 
        if (ret == 0) {
@@ -2242,21 +2108,9 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
                ibmr = ERR_PTR(-ENOMEM);
        }
 
-       reg_phys_err:
-       /* free the resources */
-       if (root_pbl_index == 1) {
-               /* single PBL case */
-               pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
-       } else {
-               for (i=0; i<root_pbl_index; i++) {
-                       pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase,
-                                       root_vpbl.leaf_vpbl[i].pbl_pbase);
-               }
-               kfree(root_vpbl.leaf_vpbl);
-               pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
-                               root_vpbl.pbl_pbase);
-       }
-
+reg_phys_err:
+       /* single PBL case */
+       pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
        return ibmr;
 }
 
@@ -2266,17 +2120,13 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
  */
 static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc)
 {
-       struct ib_phys_buf bl;
        u64 kva = 0;
 
        nes_debug(NES_DBG_MR, "\n");
 
-       bl.size = (u64)0xffffffffffULL;
-       bl.addr = 0;
-       return nes_reg_phys_mr(pd, &bl, 1, acc, &kva);
+       return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva);
 }
 
-
 /**
  * nes_reg_user_mr
  */
@@ -3888,12 +3738,10 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
        nesibdev->ibdev.destroy_cq = nes_destroy_cq;
        nesibdev->ibdev.poll_cq = nes_poll_cq;
        nesibdev->ibdev.get_dma_mr = nes_get_dma_mr;
-       nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr;
        nesibdev->ibdev.reg_user_mr = nes_reg_user_mr;
        nesibdev->ibdev.dereg_mr = nes_dereg_mr;
        nesibdev->ibdev.alloc_mw = nes_alloc_mw;
        nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
-       nesibdev->ibdev.bind_mw = nes_bind_mw;
 
        nesibdev->ibdev.alloc_mr = nes_alloc_mr;
        nesibdev->ibdev.map_mr_sg = nes_map_mr_sg;
index a204b677af22a8d00fba4b5b65bc0620d4cb7a1c..70290883d06769f3eee19ed4b36750808c2fa3d8 100644 (file)
@@ -190,4 +190,8 @@ struct nes_qp {
        u8                    pau_state;
        __u64                 nesuqp_addr;
 };
+
+struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
+               u64 addr, u64 size, int acc, u64 *iova_start);
+
 #endif                 /* NES_VERBS_H */
index 9820074be59d73bd1aac4b56d4a38254d2f9b7bb..3790771f2baad2bae17c52298e9b293eee3e7bad 100644 (file)
@@ -152,9 +152,10 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
        if ((pd->uctx) &&
            (!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) &&
            (!rdma_link_local_addr((struct in6_addr *)attr->grh.dgid.raw))) {
-               status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
-                                                   attr->dmac, &vlan_tag,
-                                                   sgid_attr.ndev->ifindex);
+               status = rdma_addr_find_l2_eth_by_grh(&sgid, &attr->grh.dgid,
+                                                     attr->dmac, &vlan_tag,
+                                                     &sgid_attr.ndev->ifindex,
+                                                     NULL);
                if (status) {
                        pr_err("%s(): Failed to resolve dmac from gid." 
                                "status = %d\n", __func__, status);
index 3afb40b85159bd2c10559443536f83af5e4fa9ca..573849354cb94f4ef6f46397266518f9aaf7ef7b 100644 (file)
@@ -175,7 +175,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
        dev->ibdev.req_notify_cq = ocrdma_arm_cq;
 
        dev->ibdev.get_dma_mr = ocrdma_get_dma_mr;
-       dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr;
        dev->ibdev.dereg_mr = ocrdma_dereg_mr;
        dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
 
index 76e96f97b3f6459e68d13c444be6e1beb6b578a9..d4c687b548d8696e66752da16a28f03d2481cd60 100644 (file)
@@ -3066,169 +3066,6 @@ pl_err:
        return ERR_PTR(-ENOMEM);
 }
 
-#define MAX_KERNEL_PBE_SIZE 65536
-static inline int count_kernel_pbes(struct ib_phys_buf *buf_list,
-                                   int buf_cnt, u32 *pbe_size)
-{
-       u64 total_size = 0;
-       u64 buf_size = 0;
-       int i;
-       *pbe_size = roundup(buf_list[0].size, PAGE_SIZE);
-       *pbe_size = roundup_pow_of_two(*pbe_size);
-
-       /* find the smallest PBE size that we can have */
-       for (i = 0; i < buf_cnt; i++) {
-               /* first addr may not be page aligned, so ignore checking */
-               if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) ||
-                                (buf_list[i].size & ~PAGE_MASK))) {
-                       return 0;
-               }
-
-               /* if configured PBE size is greater then the chosen one,
-                * reduce the PBE size.
-                */
-               buf_size = roundup(buf_list[i].size, PAGE_SIZE);
-               /* pbe_size has to be even multiple of 4K 1,2,4,8...*/
-               buf_size = roundup_pow_of_two(buf_size);
-               if (*pbe_size > buf_size)
-                       *pbe_size = buf_size;
-
-               total_size += buf_size;
-       }
-       *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ?
-           (MAX_KERNEL_PBE_SIZE) : (*pbe_size);
-
-       /* num_pbes = total_size / (*pbe_size);  this is implemented below. */
-
-       return total_size >> ilog2(*pbe_size);
-}
-
-static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt,
-                             u32 pbe_size, struct ocrdma_pbl *pbl_tbl,
-                             struct ocrdma_hw_mr *hwmr)
-{
-       int i;
-       int idx;
-       int pbes_per_buf = 0;
-       u64 buf_addr = 0;
-       int num_pbes;
-       struct ocrdma_pbe *pbe;
-       int total_num_pbes = 0;
-
-       if (!hwmr->num_pbes)
-               return;
-
-       pbe = (struct ocrdma_pbe *)pbl_tbl->va;
-       num_pbes = 0;
-
-       /* go through the OS phy regions & fill hw pbe entries into pbls. */
-       for (i = 0; i < ib_buf_cnt; i++) {
-               buf_addr = buf_list[i].addr;
-               pbes_per_buf =
-                   roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) /
-                   pbe_size;
-               hwmr->len += buf_list[i].size;
-               /* number of pbes can be more for one OS buf, when
-                * buffers are of different sizes.
-                * split the ib_buf to one or more pbes.
-                */
-               for (idx = 0; idx < pbes_per_buf; idx++) {
-                       /* we program always page aligned addresses,
-                        * first unaligned address is taken care by fbo.
-                        */
-                       if (i == 0) {
-                               /* for non zero fbo, assign the
-                                * start of the page.
-                                */
-                               pbe->pa_lo =
-                                   cpu_to_le32((u32) (buf_addr & PAGE_MASK));
-                               pbe->pa_hi =
-                                   cpu_to_le32((u32) upper_32_bits(buf_addr));
-                       } else {
-                               pbe->pa_lo =
-                                   cpu_to_le32((u32) (buf_addr & 0xffffffff));
-                               pbe->pa_hi =
-                                   cpu_to_le32((u32) upper_32_bits(buf_addr));
-                       }
-                       buf_addr += pbe_size;
-                       num_pbes += 1;
-                       total_num_pbes += 1;
-                       pbe++;
-
-                       if (total_num_pbes == hwmr->num_pbes)
-                               goto mr_tbl_done;
-                       /* if the pbl is full storing the pbes,
-                        * move to next pbl.
-                        */
-                       if (num_pbes == (hwmr->pbl_size/sizeof(u64))) {
-                               pbl_tbl++;
-                               pbe = (struct ocrdma_pbe *)pbl_tbl->va;
-                               num_pbes = 0;
-                       }
-               }
-       }
-mr_tbl_done:
-       return;
-}
-
-struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd,
-                                  struct ib_phys_buf *buf_list,
-                                  int buf_cnt, int acc, u64 *iova_start)
-{
-       int status = -ENOMEM;
-       struct ocrdma_mr *mr;
-       struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
-       struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
-       u32 num_pbes;
-       u32 pbe_size = 0;
-
-       if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE))
-               return ERR_PTR(-EINVAL);
-
-       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-       if (!mr)
-               return ERR_PTR(status);
-
-       num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size);
-       if (num_pbes == 0) {
-               status = -EINVAL;
-               goto pbl_err;
-       }
-       status = ocrdma_get_pbl_info(dev, mr, num_pbes);
-       if (status)
-               goto pbl_err;
-
-       mr->hwmr.pbe_size = pbe_size;
-       mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK);
-       mr->hwmr.va = *iova_start;
-       mr->hwmr.local_rd = 1;
-       mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
-       mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
-       mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
-       mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
-       mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0;
-
-       status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
-       if (status)
-               goto pbl_err;
-       build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table,
-                         &mr->hwmr);
-       status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc);
-       if (status)
-               goto mbx_err;
-
-       mr->ibmr.lkey = mr->hwmr.lkey;
-       if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
-               mr->ibmr.rkey = mr->hwmr.lkey;
-       return &mr->ibmr;
-
-mbx_err:
-       ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
-pbl_err:
-       kfree(mr);
-       return ERR_PTR(status);
-}
-
 static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr)
 {
        struct ocrdma_mr *mr = get_ocrdma_mr(ibmr);
index a2f3b4dc20b0363d0a09769aaee2d69dd957ae33..8b517fd3677924099a1d08663035de8f19a09d79 100644 (file)
@@ -117,9 +117,6 @@ int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *,
 
 int ocrdma_dereg_mr(struct ib_mr *);
 struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc);
-struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
-                                  struct ib_phys_buf *buffer_list,
-                                  int num_phys_buf, int acc, u64 *iova_start);
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
                                 u64 virt, int acc, struct ib_udata *);
 struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
index 13ef22bd9459200309f2ab9fccb248eccbef9027..fcdf37913a264d52b5204f771523b37a2b0c4985 100644 (file)
@@ -89,14 +89,14 @@ static int create_file(const char *name, umode_t mode,
 {
        int error;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        *dentry = lookup_one_len(name, parent, strlen(name));
        if (!IS_ERR(*dentry))
                error = qibfs_mknod(d_inode(parent), *dentry,
                                    mode, fops, data);
        else
                error = PTR_ERR(*dentry);
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
 
        return error;
 }
@@ -481,7 +481,7 @@ static int remove_device_files(struct super_block *sb,
        int ret, i;
 
        root = dget(sb->s_root);
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
        snprintf(unit, sizeof(unit), "%u", dd->unit);
        dir = lookup_one_len(unit, root, strlen(unit));
 
@@ -491,7 +491,7 @@ static int remove_device_files(struct super_block *sb,
                goto bail;
        }
 
-       mutex_lock(&d_inode(dir)->i_mutex);
+       inode_lock(d_inode(dir));
        remove_file(dir, "counters");
        remove_file(dir, "counter_names");
        remove_file(dir, "portcounter_names");
@@ -506,13 +506,13 @@ static int remove_device_files(struct super_block *sb,
                }
        }
        remove_file(dir, "flash");
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        ret = simple_rmdir(d_inode(root), dir);
        d_delete(dir);
        dput(dir);
 
 bail:
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
        dput(root);
        return ret;
 }
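
The inode_lock()/inode_unlock() conversion is mechanical: the wrappers hide the per-inode lock so the VFS can later change its type without touching filesystems. A sketch of the wrappers as assumed from <linux/fs.h> of this era (later kernels swapped the mutex for an rwsem behind the same interface):

        static inline void inode_lock(struct inode *inode)
        {
                mutex_lock(&inode->i_mutex);
        }

        static inline void inode_unlock(struct inode *inode)
        {
                mutex_unlock(&inode->i_mutex);
        }
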
index 294f5c706be972b4b6563e37a32064fade665be2..5f53304e8a9b5a4c71448ba02865d912915d13a1 100644 (file)
@@ -150,10 +150,7 @@ static struct qib_mr *alloc_mr(int count, struct ib_pd *pd)
        rval = init_qib_mregion(&mr->mr, pd, count);
        if (rval)
                goto bail;
-       /*
-        * ib_reg_phys_mr() will initialize mr->ibmr except for
-        * lkey and rkey.
-        */
+
        rval = qib_alloc_lkey(&mr->mr, 0);
        if (rval)
                goto bail_mregion;
@@ -170,52 +167,6 @@ bail:
        goto done;
 }
 
-/**
- * qib_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
-                             struct ib_phys_buf *buffer_list,
-                             int num_phys_buf, int acc, u64 *iova_start)
-{
-       struct qib_mr *mr;
-       int n, m, i;
-       struct ib_mr *ret;
-
-       mr = alloc_mr(num_phys_buf, pd);
-       if (IS_ERR(mr)) {
-               ret = (struct ib_mr *)mr;
-               goto bail;
-       }
-
-       mr->mr.user_base = *iova_start;
-       mr->mr.iova = *iova_start;
-       mr->mr.access_flags = acc;
-
-       m = 0;
-       n = 0;
-       for (i = 0; i < num_phys_buf; i++) {
-               mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
-               mr->mr.map[m]->segs[n].length = buffer_list[i].size;
-               mr->mr.length += buffer_list[i].size;
-               n++;
-               if (n == QIB_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
 /**
  * qib_reg_user_mr - register a userspace memory region
  * @pd: protection domain for this memory region
index 40f85bb3e0d3bdce5289a5c8c9c2418df33e4ca4..3eff35c2d453f7b11f9475f5120cb4df26f94dda 100644 (file)
@@ -100,9 +100,10 @@ static u32 credit_table[31] = {
        32768                   /* 1E */
 };
 
-static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map)
+static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map,
+                        gfp_t gfp)
 {
-       unsigned long page = get_zeroed_page(GFP_KERNEL);
+       unsigned long page = get_zeroed_page(gfp);
 
        /*
         * Free the page if someone raced with us installing it.
@@ -121,7 +122,7 @@ static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map)
  * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
  */
 static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt,
-                    enum ib_qp_type type, u8 port)
+                    enum ib_qp_type type, u8 port, gfp_t gfp)
 {
        u32 i, offset, max_scan, qpn;
        struct qpn_map *map;
@@ -151,7 +152,7 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt,
        max_scan = qpt->nmaps - !offset;
        for (i = 0;;) {
                if (unlikely(!map->page)) {
-                       get_map_page(qpt, map);
+                       get_map_page(qpt, map, gfp);
                        if (unlikely(!map->page))
                                break;
                }
@@ -983,13 +984,21 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
        size_t sz;
        size_t sg_list_sz;
        struct ib_qp *ret;
+       gfp_t gfp;
+
 
        if (init_attr->cap.max_send_sge > ib_qib_max_sges ||
            init_attr->cap.max_send_wr > ib_qib_max_qp_wrs ||
-           init_attr->create_flags) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
+           init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO))
+               return ERR_PTR(-EINVAL);
+
+       /* GFP_NOIO is applicable in RC QPs only */
+       if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO &&
+           init_attr->qp_type != IB_QPT_RC)
+               return ERR_PTR(-EINVAL);
+
+       gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ?
+                       GFP_NOIO : GFP_KERNEL;
 
        /* Check receive queue parameters if no SRQ is specified. */
        if (!init_attr->srq) {
@@ -1021,7 +1030,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
                sz = sizeof(struct qib_sge) *
                        init_attr->cap.max_send_sge +
                        sizeof(struct qib_swqe);
-               swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+               swq = __vmalloc((init_attr->cap.max_send_wr + 1) * sz,
+                               gfp, PAGE_KERNEL);
                if (swq == NULL) {
                        ret = ERR_PTR(-ENOMEM);
                        goto bail;
@@ -1037,13 +1047,13 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
                } else if (init_attr->cap.max_recv_sge > 1)
                        sg_list_sz = sizeof(*qp->r_sg_list) *
                                (init_attr->cap.max_recv_sge - 1);
-               qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
+               qp = kzalloc(sz + sg_list_sz, gfp);
                if (!qp) {
                        ret = ERR_PTR(-ENOMEM);
                        goto bail_swq;
                }
                RCU_INIT_POINTER(qp->next, NULL);
-               qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
+               qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), gfp);
                if (!qp->s_hdr) {
                        ret = ERR_PTR(-ENOMEM);
                        goto bail_qp;
@@ -1058,8 +1068,16 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
                        qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
                        sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
                                sizeof(struct qib_rwqe);
-                       qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) +
-                                                  qp->r_rq.size * sz);
+                       if (gfp != GFP_NOIO)
+                               qp->r_rq.wq = vmalloc_user(
+                                               sizeof(struct qib_rwq) +
+                                               qp->r_rq.size * sz);
+                       else
+                               qp->r_rq.wq = __vmalloc(
+                                               sizeof(struct qib_rwq) +
+                                               qp->r_rq.size * sz,
+                                               gfp, PAGE_KERNEL);
+
                        if (!qp->r_rq.wq) {
                                ret = ERR_PTR(-ENOMEM);
                                goto bail_qp;
@@ -1090,7 +1108,7 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
                dev = to_idev(ibpd->device);
                dd = dd_from_dev(dev);
                err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type,
-                               init_attr->port_num);
+                               init_attr->port_num, gfp);
                if (err < 0) {
                        ret = ERR_PTR(err);
                        vfree(qp->r_rq.wq);
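
The gfp plumbing exists because plain vmalloc() and vmalloc_user() imply GFP_KERNEL; a caller that asked for IB_QP_CREATE_USE_GFP_NOIO (permitted on RC QPs only, e.g. block drivers that must not recurse into I/O during reclaim) needs the __vmalloc() spelling so the flags can be passed through. The core of the pattern, reduced to a sketch in which nbytes stands in for the real size computations:

        gfp_t gfp = (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO) ?
                        GFP_NOIO : GFP_KERNEL;

        qp = kzalloc(nbytes, gfp);      /* kzalloc() takes gfp directly */
        /* vmalloc()/vmalloc_user() imply GFP_KERNEL, so NOIO callers
         * must use the __vmalloc() spelling to pass the flags through */
        swq = __vmalloc(nbytes, gfp, PAGE_KERNEL);
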
index de6cb6fcda8df3d5a1060aceecf20c832ba81ace..baf1e42b6896cbd0efcf29e0d52aeed3c20e1e21 100644 (file)
@@ -346,6 +346,7 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr,
        unsigned long flags;
        struct qib_lkey_table *rkt;
        struct qib_pd *pd;
+       int avoid_schedule = 0;
 
        spin_lock_irqsave(&qp->s_lock, flags);
 
@@ -438,11 +439,15 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr,
            qp->ibqp.qp_type == IB_QPT_RC) {
                if (wqe->length > 0x80000000U)
                        goto bail_inval_free;
+               if (wqe->length <= qp->pmtu)
+                       avoid_schedule = 1;
        } else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport +
-                                 qp->port_num - 1)->ibmtu)
+                                 qp->port_num - 1)->ibmtu) {
                goto bail_inval_free;
-       else
+       } else {
                atomic_inc(&to_iah(ud_wr(wr)->ah)->refcount);
+               avoid_schedule = 1;
+       }
        wqe->ssn = qp->s_ssn++;
        qp->s_head = next;
 
@@ -458,7 +463,7 @@ bail_inval_free:
 bail_inval:
        ret = -EINVAL;
 bail:
-       if (!ret && !wr->next &&
+       if (!ret && !wr->next && !avoid_schedule &&
         !qib_sdma_empty(
           dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) {
                qib_schedule_send(qp);
@@ -2256,7 +2261,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
        ibdev->poll_cq = qib_poll_cq;
        ibdev->req_notify_cq = qib_req_notify_cq;
        ibdev->get_dma_mr = qib_get_dma_mr;
-       ibdev->reg_phys_mr = qib_reg_phys_mr;
        ibdev->reg_user_mr = qib_reg_user_mr;
        ibdev->dereg_mr = qib_dereg_mr;
        ibdev->alloc_mr = qib_alloc_mr;
index bc803f33d5f6883200ca21e462febebb94393cd4..6c5e77753d858da2d9adb0ddfc1cd753b1e3fb07 100644 (file)
@@ -1032,10 +1032,6 @@ int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 
 struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc);
 
-struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
-                             struct ib_phys_buf *buffer_list,
-                             int num_phys_buf, int acc, u64 *iova_start);
-
 struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                              u64 virt_addr, int mr_access_flags,
                              struct ib_udata *udata);
index f8ea069a3eafca4078ab70848c3804867609143c..b2fb5286dbd98250534964887a8c2c46678cd725 100644 (file)
@@ -286,15 +286,13 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
        struct qib_ibdev *dev = to_idev(ibqp->device);
        struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num);
        struct qib_mcast *mcast = NULL;
-       struct qib_mcast_qp *p, *tmp;
+       struct qib_mcast_qp *p, *tmp, *delp = NULL;
        struct rb_node *n;
        int last = 0;
        int ret;
 
-       if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
-               ret = -EINVAL;
-               goto bail;
-       }
+       if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET)
+               return -EINVAL;
 
        spin_lock_irq(&ibp->lock);
 
@@ -303,8 +301,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
        while (1) {
                if (n == NULL) {
                        spin_unlock_irq(&ibp->lock);
-                       ret = -EINVAL;
-                       goto bail;
+                       return -EINVAL;
                }
 
                mcast = rb_entry(n, struct qib_mcast, rb_node);
@@ -328,6 +325,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
                 */
                list_del_rcu(&p->list);
                mcast->n_attached--;
+               delp = p;
 
                /* If this was the last attached QP, remove the GID too. */
                if (list_empty(&mcast->qp_list)) {
@@ -338,15 +336,16 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
        }
 
        spin_unlock_irq(&ibp->lock);
+       /* QP not attached */
+       if (!delp)
+               return -EINVAL;
+       /*
+        * Wait for any list walkers to finish before freeing the
+        * list element.
+        */
+       wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+       qib_mcast_qp_free(delp);
 
-       if (p) {
-               /*
-                * Wait for any list walkers to finish before freeing the
-                * list element.
-                */
-               wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
-               qib_mcast_qp_free(p);
-       }
        if (last) {
                atomic_dec(&mcast->refcount);
                wait_event(mcast->wait, !atomic_read(&mcast->refcount));
@@ -355,11 +354,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
                dev->n_mcast_grps_allocated--;
                spin_unlock_irq(&dev->n_mcast_grps_lock);
        }
-
-       ret = 0;
-
-bail:
-       return ret;
+       return 0;
 }
 
 int qib_mcast_tree_empty(struct qib_ibport *ibp)
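
The rework fixes two defects at once: a list_for_each_entry() cursor is never NULL after the loop, so the old if (p) test could not express "not found", and the element is now freed only after readers drain, with ibp->lock already dropped. The shape of the pattern, sketched with the rb-tree walk simplified to a plain list scan:

        spin_lock_irq(&ibp->lock);
        list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
                if (p->qp == qp) {
                        list_del_rcu(&p->list); /* unlink under the lock */
                        delp = p;               /* remember what to free */
                        break;
                }
        }
        spin_unlock_irq(&ibp->lock);

        if (!delp)                              /* nothing was unlinked */
                return -EINVAL;
        /* let concurrent list walkers drain before freeing */
        wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
        qib_mcast_qp_free(delp);
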
index 5e55b8bc6fe402af423118c1454b5bc67d21d8ba..92dc66cc2d50a06ea69fb259f713784a26868e2b 100644 (file)
@@ -157,8 +157,9 @@ void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow)
                                                        qp_flow,
                                                        &flowinfo_ops);
        if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) {
-               usnic_err("Failed to create dbg fs entry for flow %u\n",
-                               qp_flow->flow->flow_id);
+               usnic_err("Failed to create dbg fs entry for flow %u with error %ld\n",
+                               qp_flow->flow->flow_id,
+                               PTR_ERR(qp_flow->dbgfs_dentry));
        }
 }
 
index fcea3a24d3eb310ef0ee3c573a3ef9e161c8e809..5f44b66ccb86481774733960b3577e83fcdfabc0 100644 (file)
@@ -521,7 +521,7 @@ int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp,
 
        if (!status) {
                qp_grp->state = new_state;
-               usnic_info("Transistioned %u from %s to %s",
+               usnic_info("Transitioned %u from %s to %s",
                qp_grp->grp_id,
                usnic_ib_qp_grp_state_to_string(old_state),
                usnic_ib_qp_grp_state_to_string(new_state));
@@ -575,7 +575,7 @@ alloc_res_chunk_list(struct usnic_vnic *vnic,
        return res_chunk_list;
 
 out_free_res:
-       for (i--; i > 0; i--)
+       for (i--; i >= 0; i--)
                usnic_vnic_put_resources(res_chunk_list[i]);
        kfree(res_chunk_list);
        return ERR_PTR(err);
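
The single-character fix above closes an unwind off-by-one: when allocation fails after `i` chunks have been filled, the rollback must release indices i-1 down to 0 inclusive, so the loop condition has to be `i >= 0`; the old `i > 0` leaked element 0. A sketch of the idiom, with acquire()/release() as hypothetical stand-ins for the usnic_vnic get/put calls:

static int grab_all(void *res[], int n)
{
        int i;

        for (i = 0; i < n; i++) {
                res[i] = acquire(i);
                if (!res[i])
                        goto out_free;
        }
        return 0;

out_free:
        /* i indexes the slot that failed; release i-1 .. 0 inclusive */
        for (i--; i >= 0; i--)
                release(res[i]);
        return -ENOMEM;
}
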
index f8e3211689a3453164c9044f3bef4601053c1dba..6cdb4d23f78f99c0282be7d202083f5aa63091e7 100644 (file)
@@ -51,7 +51,7 @@
 
 static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver)
 {
-       *fw_ver = (u64) *fw_ver_str;
+       *fw_ver = *((u64 *)fw_ver_str);
 }
 
 static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp,
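
The old cast took only the first character: `(u64)*fw_ver_str` promotes `fw_ver_str[0]` to 64 bits, while the replacement reads the first eight bytes of the version string as one integer, presumably what the response field is meant to carry. A small userspace illustration of the difference (memcpy stands in for the raw pointer cast and sidesteps its alignment concerns):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        char s[8] = "4.5.0";            /* trailing bytes are zero */
        uint64_t a = (uint64_t)*s;      /* old behavior: just '4' -> 0x34 */
        uint64_t b;

        memcpy(&b, s, sizeof(b));       /* new behavior: all eight bytes */
        printf("old=%#llx new=%#llx\n",
               (unsigned long long)a, (unsigned long long)b);
        return 0;
}
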
@@ -571,20 +571,20 @@ int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 
        qp_grp = to_uqp_grp(ibqp);
 
-       /* TODO: Future Support All States */
        mutex_lock(&qp_grp->vf->pf->usdev_lock);
-       if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) {
-               status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL);
-       } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) {
-               status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL);
-       } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) {
-               status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL);
+       if ((attr_mask & IB_QP_PORT) && attr->port_num != 1) {
+               /* usnic devices only have one port */
+               status = -EINVAL;
+               goto out_unlock;
+       }
+       if (attr_mask & IB_QP_STATE) {
+               status = usnic_ib_qp_grp_modify(qp_grp, attr->qp_state, NULL);
        } else {
-               usnic_err("Unexpected combination mask: %u state: %u\n",
-                               attr_mask & IB_QP_STATE, attr->qp_state);
+               usnic_err("Unhandled request, attr_mask=0x%x\n", attr_mask);
                status = -EINVAL;
        }
 
+out_unlock:
        mutex_unlock(&qp_grp->vf->pf->usdev_lock);
        return status;
 }
@@ -625,8 +625,8 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
                        virt_addr, length);
 
        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-       if (IS_ERR_OR_NULL(mr))
-               return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
 
        mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length,
                                        access_flags, 0);
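
The simplification above removes dead code: kzalloc() reports failure as NULL and never as an ERR_PTR, so the IS_ERR() half of IS_ERR_OR_NULL() could never trigger and `PTR_ERR(mr)` had nothing meaningful to decode. The two kernel error conventions, for contrast (sketch only; some_register_call() is a hypothetical ERR_PTR-returning API):

/* kmalloc-family APIs: failure is NULL, never an ERR_PTR. */
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr)
        return ERR_PTR(-ENOMEM);

/* ERR_PTR-returning APIs: test with IS_ERR(), decode with PTR_ERR(). */
umem = some_register_call();
if (IS_ERR(umem))
        return ERR_CAST(umem);
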
index 414eaa566bd94e5f05a796a5416ef8f76e7939f6..0d9d2e6a14d5e73137e351036486dbfa2b21af60 100644 (file)
@@ -43,8 +43,6 @@ int usnic_ib_query_device(struct ib_device *ibdev,
                          struct ib_udata *uhw);
 int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
                                struct ib_port_attr *props);
-enum rdma_protocol_type
-usnic_ib_query_protocol(struct ib_device *device, u8 port_num);
 int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                                int qp_attr_mask,
                                struct ib_qp_init_attr *qp_init_attr);
index 66de93fb8ea934c15051f7b33fa7808306f19f05..8875107186902fe8092b24d15a2372aedb6592c1 100644 (file)
@@ -237,7 +237,7 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type,
        struct usnic_vnic_res *res;
        int i;
 
-       if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner)
+       if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 0 || !owner)
                return ERR_PTR(-EINVAL);
 
        ret = kzalloc(sizeof(*ret), GFP_ATOMIC);
@@ -247,26 +247,28 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type,
                return ERR_PTR(-ENOMEM);
        }
 
-       ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC);
-       if (!ret->res) {
-               usnic_err("Failed to allocate resources for %s. Out of memory\n",
-                               usnic_vnic_pci_name(vnic));
-               kfree(ret);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (cnt > 0) {
+               ret->res = kcalloc(cnt, sizeof(*(ret->res)), GFP_ATOMIC);
+               if (!ret->res) {
+                       usnic_err("Failed to allocate resources for %s. Out of memory\n",
+                                       usnic_vnic_pci_name(vnic));
+                       kfree(ret);
+                       return ERR_PTR(-ENOMEM);
+               }
 
-       spin_lock(&vnic->res_lock);
-       src = &vnic->chunks[type];
-       for (i = 0; i < src->cnt && ret->cnt < cnt; i++) {
-               res = src->res[i];
-               if (!res->owner) {
-                       src->free_cnt--;
-                       res->owner = owner;
-                       ret->res[ret->cnt++] = res;
+               spin_lock(&vnic->res_lock);
+               src = &vnic->chunks[type];
+               for (i = 0; i < src->cnt && ret->cnt < cnt; i++) {
+                       res = src->res[i];
+                       if (!res->owner) {
+                               src->free_cnt--;
+                               res->owner = owner;
+                               ret->res[ret->cnt++] = res;
+                       }
                }
-       }
 
-       spin_unlock(&vnic->res_lock);
+               spin_unlock(&vnic->res_lock);
+       }
        ret->type = type;
        ret->vnic = vnic;
        WARN_ON(ret->cnt != cnt);
@@ -281,14 +283,16 @@ void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk)
        int i;
        struct usnic_vnic *vnic = chunk->vnic;
 
-       spin_lock(&vnic->res_lock);
-       while ((i = --chunk->cnt) >= 0) {
-               res = chunk->res[i];
-               chunk->res[i] = NULL;
-               res->owner = NULL;
-               vnic->chunks[res->type].free_cnt++;
+       if (chunk->cnt > 0) {
+               spin_lock(&vnic->res_lock);
+               while ((i = --chunk->cnt) >= 0) {
+                       res = chunk->res[i];
+                       chunk->res[i] = NULL;
+                       res->owner = NULL;
+                       vnic->chunks[res->type].free_cnt++;
+               }
+               spin_unlock(&vnic->res_lock);
        }
-       spin_unlock(&vnic->res_lock);
 
        kfree(chunk->res);
        kfree(chunk);
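
Taken together, the two hunks in this file make cnt == 0 a legitimate chunk size (note the companion guard change from `cnt < 1` to `cnt < 0` above): allocation and release both skip the array work for empty chunks, and teardown leans on kfree(NULL) being a no-op. In outline:

/* Empty chunks are legal: res stays NULL when cnt == 0. */
if (cnt > 0) {
        chunk->res = kcalloc(cnt, sizeof(*chunk->res), GFP_ATOMIC);
        if (!chunk->res)
                return ERR_PTR(-ENOMEM);
        /* claim cnt resources under vnic->res_lock */
}

/* on release: */
kfree(chunk->res);      /* safe even if it was never allocated */
kfree(chunk);
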
index 3ede103097547d4355646d322b5570ebb421d2ec..a6f3eab0f350ef036cfb44a47fbbafe5f9bd1fc1 100644 (file)
@@ -495,7 +495,6 @@ void ipoib_dev_cleanup(struct net_device *dev);
 void ipoib_mcast_join_task(struct work_struct *work);
 void ipoib_mcast_carrier_on_task(struct work_struct *work);
 void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
-void ipoib_mcast_free(struct ipoib_mcast *mc);
 
 void ipoib_mcast_restart_task(struct work_struct *work);
 int ipoib_mcast_start_thread(struct net_device *dev);
@@ -549,8 +548,9 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
                       union ib_gid *mgid, int set_qkey);
-int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast);
-struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid);
+void ipoib_mcast_remove_list(struct list_head *remove_list);
+void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid,
+                               struct list_head *remove_list);
 
 int ipoib_init_qp(struct net_device *dev);
 int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
index 3ae9726efb9837512a62214705bf5e8e9561a02c..917e46ea3bf681681a4abba5e9ce6e86514d07eb 100644 (file)
@@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
 
 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
-       .wr_id = IPOIB_CM_RX_DRAIN_WRID,
        .opcode = IB_WR_SEND,
 };
 
@@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
         * error" WC will be immediately generated for each WR we post.
         */
        p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+       ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
        if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
                ipoib_warn(priv, "failed to post drain wr\n");
 
@@ -1522,8 +1522,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
 int ipoib_cm_dev_init(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
-       int i, ret;
-       struct ib_device_attr attr;
+       int max_srq_sge, i;
 
        INIT_LIST_HEAD(&priv->cm.passive_ids);
        INIT_LIST_HEAD(&priv->cm.reap_list);
@@ -1540,19 +1539,13 @@ int ipoib_cm_dev_init(struct net_device *dev)
 
        skb_queue_head_init(&priv->cm.skb_queue);
 
-       ret = ib_query_device(priv->ca, &attr);
-       if (ret) {
-               printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
-               return ret;
-       }
-
-       ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+       ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge);
 
-       attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
-       ipoib_cm_create_srq(dev, attr.max_srq_sge);
+       max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge);
+       ipoib_cm_create_srq(dev, max_srq_sge);
        if (ipoib_cm_has_srq(dev)) {
-               priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
-               priv->cm.num_frags  = attr.max_srq_sge;
+               priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10;
+               priv->cm.num_frags  = max_srq_sge;
                ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
                          priv->cm.max_cm_mtu, priv->cm.num_frags);
        } else {
index 078cadd6c797afeb0e22267fb76e5362a4b97326..a53fa5fc0dec7d65900cce1133b2d4e2326183cc 100644 (file)
@@ -40,15 +40,11 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
                              struct ethtool_drvinfo *drvinfo)
 {
        struct ipoib_dev_priv *priv = netdev_priv(netdev);
-       struct ib_device_attr *attr;
-
-       attr = kmalloc(sizeof(*attr), GFP_KERNEL);
-       if (attr && !ib_query_device(priv->ca, attr))
-               snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
-                        "%d.%d.%d", (int)(attr->fw_ver >> 32),
-                        (int)(attr->fw_ver >> 16) & 0xffff,
-                        (int)attr->fw_ver & 0xffff);
-       kfree(attr);
+
+       snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+                "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32),
+                (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff,
+                (int)priv->ca->attrs.fw_ver & 0xffff);
 
        strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device),
                sizeof(drvinfo->bus_info));
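
Both ipoib hunks above ride the same 4.5-era core change: device attributes are captured once into ib_device->attrs when the device registers, so consumers read the cached copy instead of calling ib_query_device() into a scratch ib_device_attr. The fw_ver packing itself is unchanged, major/minor/sub-minor in bits 63:32, 31:16 and 15:0 respectively; with a made-up value:

/* Decoding fw_ver the way ipoib_get_drvinfo() does (illustrative value). */
u64 fw_ver = (2ULL << 32) | (42ULL << 16) | 5000ULL;

int major = (int)(fw_ver >> 32);          /* 2    */
int minor = (int)(fw_ver >> 16) & 0xffff; /* 42   */
int sub   = (int)fw_ver & 0xffff;         /* 5000, i.e. "2.42.5000" */
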
index 7d3281866ffcd520d4bca543a9ad0544f2ec159f..25509bbd4a050a076dba5d8bd88b718fa1a72778 100644 (file)
@@ -1150,8 +1150,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
        unsigned long flags;
        int i;
        LIST_HEAD(remove_list);
-       struct ipoib_mcast *mcast, *tmcast;
-       struct net_device *dev = priv->dev;
 
        if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
                return;
@@ -1179,18 +1177,8 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
                                                          lockdep_is_held(&priv->lock))) != NULL) {
                        /* was the neigh idle for two GC periods */
                        if (time_after(neigh_obsolete, neigh->alive)) {
-                               u8 *mgid = neigh->daddr + 4;
 
-                               /* Is this multicast ? */
-                               if (*mgid == 0xff) {
-                                       mcast = __ipoib_mcast_find(dev, mgid);
-
-                                       if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
-                                               list_del(&mcast->list);
-                                               rb_erase(&mcast->rb_node, &priv->multicast_tree);
-                                               list_add_tail(&mcast->list, &remove_list);
-                                       }
-                               }
+                               ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
 
                                rcu_assign_pointer(*np,
                                                   rcu_dereference_protected(neigh->hnext,
@@ -1207,10 +1195,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
 
 out_unlock:
        spin_unlock_irqrestore(&priv->lock, flags);
-       list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
-               ipoib_mcast_leave(dev, mcast);
-               ipoib_mcast_free(mcast);
-       }
+       ipoib_mcast_remove_list(&remove_list);
 }
 
 static void ipoib_reap_neigh(struct work_struct *work)
@@ -1777,26 +1762,7 @@ int ipoib_add_pkey_attr(struct net_device *dev)
 
 int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
 {
-       struct ib_device_attr *device_attr;
-       int result = -ENOMEM;
-
-       device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
-       if (!device_attr) {
-               printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
-                      hca->name, sizeof *device_attr);
-               return result;
-       }
-
-       result = ib_query_device(hca, device_attr);
-       if (result) {
-               printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
-                      hca->name, result);
-               kfree(device_attr);
-               return result;
-       }
-       priv->hca_caps = device_attr->device_cap_flags;
-
-       kfree(device_attr);
+       priv->hca_caps = hca->attrs.device_cap_flags;
 
        if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
                priv->dev->hw_features = NETIF_F_SG |
index f357ca67a41cd859b2ff5e704621f72813b91fa8..050dfa175d169dd3f77ee83767f1988062989891 100644 (file)
@@ -106,7 +106,7 @@ static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
                queue_delayed_work(priv->wq, &priv->mcast_task, 0);
 }
 
-void ipoib_mcast_free(struct ipoib_mcast *mcast)
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 {
        struct net_device *dev = mcast->dev;
        int tx_dropped = 0;
@@ -153,7 +153,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
        return mcast;
 }
 
-struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
+static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct rb_node *n = priv->multicast_tree.rb_node;
@@ -677,7 +677,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev)
        return 0;
 }
 
-int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int ret = 0;
@@ -704,6 +704,35 @@ int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
        return 0;
 }
 
+/*
+ * Check if the multicast group is sendonly. If so, remove it from the
+ * maps and add it to the remove list.
+ */

+void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid,
+                               struct list_head *remove_list)
+{
+       /* Is this multicast ? */
+       if (*mgid == 0xff) {
+               struct ipoib_mcast *mcast = __ipoib_mcast_find(priv->dev, mgid);
+
+               if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+                       list_del(&mcast->list);
+                       rb_erase(&mcast->rb_node, &priv->multicast_tree);
+                       list_add_tail(&mcast->list, remove_list);
+               }
+       }
+}
+
+void ipoib_mcast_remove_list(struct list_head *remove_list)
+{
+       struct ipoib_mcast *mcast, *tmcast;
+
+       list_for_each_entry_safe(mcast, tmcast, remove_list, list) {
+               ipoib_mcast_leave(mcast->dev, mcast);
+               ipoib_mcast_free(mcast);
+       }
+}
+
 void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
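
Factoring three identical teardown loops into ipoib_mcast_remove_list() also makes the choice of iterator easy to see: the body frees the current entry, so the _safe variant, which caches the next node before running the body, is mandatory. The idiom in isolation, with a hypothetical item type:

struct item {
        struct list_head list;
};

static void drain_list(struct list_head *head)
{
        struct item *it, *tmp;

        /* 'tmp' holds the next node before 'it' is freed; plain
         * list_for_each_entry would step through freed memory. */
        list_for_each_entry_safe(it, tmp, head, list) {
                list_del(&it->list);
                kfree(it);
        }
}
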
@@ -810,10 +839,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
                if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
                        wait_for_completion(&mcast->done);
 
-       list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
-               ipoib_mcast_leave(dev, mcast);
-               ipoib_mcast_free(mcast);
-       }
+       ipoib_mcast_remove_list(&remove_list);
 }
 
 static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast)
@@ -939,10 +965,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
                if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
                        wait_for_completion(&mcast->done);
 
-       list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
-               ipoib_mcast_leave(mcast->dev, mcast);
-               ipoib_mcast_free(mcast);
-       }
+       ipoib_mcast_remove_list(&remove_list);
 
        /*
         * Double check that we are still up
index 9080161e01af1614afa5796a134d7fd890a1be45..c827c93f46c547ce7ec810a2636723a7f3b582a8 100644 (file)
@@ -644,7 +644,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 
                ib_conn = &iser_conn->ib_conn;
                if (ib_conn->pi_support) {
-                       u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap;
+                       u32 sig_caps = ib_conn->device->ib_device->attrs.sig_prot_cap;
 
                        scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
                        scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP |
@@ -656,7 +656,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
                 * max fastreg page list length.
                 */
                shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize,
-                       ib_conn->device->dev_attr.max_fast_reg_page_list_len);
+                       ib_conn->device->ib_device->attrs.max_fast_reg_page_list_len);
                shost->max_sectors = min_t(unsigned int,
                        1024, (shost->sg_tablesize * PAGE_SIZE) >> 9);
 
@@ -1059,7 +1059,8 @@ static int __init iser_init(void)
        release_wq = alloc_workqueue("release workqueue", 0, 0);
        if (!release_wq) {
                iser_err("failed to allocate release workqueue\n");
-               return -ENOMEM;
+               err = -ENOMEM;
+               goto err_alloc_wq;
        }
 
        iscsi_iser_scsi_transport = iscsi_register_transport(
@@ -1067,12 +1068,14 @@ static int __init iser_init(void)
        if (!iscsi_iser_scsi_transport) {
                iser_err("iscsi_register_transport failed\n");
                err = -EINVAL;
-               goto register_transport_failure;
+               goto err_reg;
        }
 
        return 0;
 
-register_transport_failure:
+err_reg:
+       destroy_workqueue(release_wq);
+err_alloc_wq:
        kmem_cache_destroy(ig.desc_cache);
 
        return err;
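
The relabelled exit path fixes a leak: when iscsi_register_transport() failed, the already-created workqueue was never destroyed. The fix is the standard goto ladder, one label per acquired resource, unwinding in reverse order. Reduced to its shape (register_transport() is a hypothetical stand-in):

static struct workqueue_struct *wq;

static int __init init_example(void)
{
        int err;

        wq = alloc_workqueue("example", 0, 0);
        if (!wq)
                return -ENOMEM;         /* nothing to unwind yet */

        if (!register_transport()) {
                err = -EINVAL;
                goto err_reg;           /* undo the workqueue first */
        }
        return 0;

err_reg:
        destroy_workqueue(wq);
        return err;
}
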
index 8a5998e6a407997906e45864ac9799fdac468084..95f0a64e076b9addc5b01fc17e268f211c9e6f04 100644 (file)
@@ -48,6 +48,7 @@
 #include <scsi/scsi_transport_iscsi.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_device.h>
+#include <scsi/iser.h>
 
 #include <linux/interrupt.h>
 #include <linux/wait.h>
                                         - ISER_MAX_RX_MISC_PDUS) /     \
                                         (1 + ISER_INFLIGHT_DATAOUTS))
 
-#define ISER_WC_BATCH_COUNT   16
 #define ISER_SIGNAL_CMD_COUNT 32
 
-#define ISER_VER                       0x10
-#define ISER_WSV                       0x08
-#define ISER_RSV                       0x04
-
-#define ISER_FASTREG_LI_WRID           0xffffffffffffffffULL
-#define ISER_BEACON_WRID               0xfffffffffffffffeULL
-
-/**
- * struct iser_hdr - iSER header
- *
- * @flags:        flags support (zbva, remote_inv)
- * @rsvd:         reserved
- * @write_stag:   write rkey
- * @write_va:     write virtual address
- * @reaf_stag:    read rkey
- * @read_va:      read virtual address
- */
-struct iser_hdr {
-       u8      flags;
-       u8      rsvd[3];
-       __be32  write_stag;
-       __be64  write_va;
-       __be32  read_stag;
-       __be64  read_va;
-} __attribute__((packed));
-
-
-#define ISER_ZBVA_NOT_SUPPORTED                0x80
-#define ISER_SEND_W_INV_NOT_SUPPORTED  0x40
-
-struct iser_cm_hdr {
-       u8      flags;
-       u8      rsvd[3];
-} __packed;
-
 /* Constant PDU lengths calculations */
-#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+#define ISER_HEADERS_LEN       (sizeof(struct iser_ctrl) + sizeof(struct iscsi_hdr))
 
 #define ISER_RECV_DATA_SEG_LEN 128
 #define ISER_RX_PAYLOAD_SIZE   (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
@@ -269,7 +234,7 @@ enum iser_desc_type {
 #define ISER_MAX_WRS 7
 
 /**
- * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ * struct iser_tx_desc - iSER TX descriptor
  *
  * @iser_header:   iser header
  * @iscsi_header:  iscsi header
@@ -287,12 +252,13 @@ enum iser_desc_type {
  * @sig_attrs:     Signature attributes
  */
 struct iser_tx_desc {
-       struct iser_hdr              iser_header;
+       struct iser_ctrl             iser_header;
        struct iscsi_hdr             iscsi_header;
        enum   iser_desc_type        type;
        u64                          dma_addr;
        struct ib_sge                tx_sg[2];
        int                          num_sge;
+       struct ib_cqe                cqe;
        bool                         mapped;
        u8                           wr_idx;
        union iser_wr {
@@ -306,9 +272,10 @@ struct iser_tx_desc {
 };
 
 #define ISER_RX_PAD_SIZE       (256 - (ISER_RX_PAYLOAD_SIZE + \
-                                       sizeof(u64) + sizeof(struct ib_sge)))
+                                sizeof(u64) + sizeof(struct ib_sge) + \
+                                sizeof(struct ib_cqe)))
 /**
- * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ * struct iser_rx_desc - iSER RX descriptor
  *
  * @iser_header:   iser header
  * @iscsi_header:  iscsi header
@@ -318,12 +285,32 @@ struct iser_tx_desc {
  * @pad:           for sense data TODO: Modify to maximum sense length supported
  */
 struct iser_rx_desc {
-       struct iser_hdr              iser_header;
+       struct iser_ctrl             iser_header;
        struct iscsi_hdr             iscsi_header;
        char                         data[ISER_RECV_DATA_SEG_LEN];
        u64                          dma_addr;
        struct ib_sge                rx_sg;
+       struct ib_cqe                cqe;
        char                         pad[ISER_RX_PAD_SIZE];
+} __packed;
+
+/**
+ * struct iser_login_desc - iSER login descriptor
+ *
+ * @req:           pointer to login request buffer
+ * @resp:          pointer to login response buffer
+ * @req_dma:       DMA address of login request buffer
+ * @rsp_dma:       DMA address of login response buffer
+ * @sge:           IB sge for login post recv
+ * @cqe:           completion handler
+ */
+struct iser_login_desc {
+       void                         *req;
+       void                         *rsp;
+       u64                          req_dma;
+       u64                          rsp_dma;
+       struct ib_sge                sge;
+       struct ib_cqe                cqe;
 } __attribute__((packed));
 
 struct iser_conn;
@@ -333,18 +320,12 @@ struct iscsi_iser_task;
 /**
  * struct iser_comp - iSER completion context
  *
- * @device:     pointer to device handle
  * @cq:         completion queue
- * @wcs:        work completion array
- * @tasklet:    Tasklet handle
  * @active_qps: Number of active QPs attached
  *              to completion context
  */
 struct iser_comp {
-       struct iser_device      *device;
        struct ib_cq            *cq;
-       struct ib_wc             wcs[ISER_WC_BATCH_COUNT];
-       struct tasklet_struct    tasklet;
        int                      active_qps;
 };
 
@@ -380,7 +361,6 @@ struct iser_reg_ops {
  *
  * @ib_device:     RDMA device
  * @pd:            Protection Domain for this device
- * @dev_attr:      Device attributes container
  * @mr:            Global DMA memory region
  * @event_handler: IB events handle routine
  * @ig_list:      entry in devices list
@@ -389,18 +369,19 @@ struct iser_reg_ops {
  *                 cpus and device max completion vectors
 * @comps:         Dynamically allocated array of completion handlers
  * @reg_ops:       Registration ops
+ * @remote_inv_sup: Remote invalidate is supported on this device
  */
 struct iser_device {
        struct ib_device             *ib_device;
        struct ib_pd                 *pd;
-       struct ib_device_attr        dev_attr;
        struct ib_mr                 *mr;
        struct ib_event_handler      event_handler;
        struct list_head             ig_list;
        int                          refcount;
        int                          comps_used;
        struct iser_comp             *comps;
-       struct iser_reg_ops          *reg_ops;
+       const struct iser_reg_ops    *reg_ops;
+       bool                         remote_inv_sup;
 };
 
 #define ISER_CHECK_GUARD       0xc0
@@ -475,10 +456,11 @@ struct iser_fr_pool {
  * @rx_wr:               receive work request for batch posts
  * @device:              reference to iser device
  * @comp:                iser completion context
- * @pi_support:          Indicate device T10-PI support
- * @beacon:              beacon send wr to signal all flush errors were drained
- * @flush_comp:          completes when all connection completions consumed
+ * @fr_pool:             connection fast registration pool
+ * @pi_support:          Indicate device T10-PI support
+ * @last:                last send wr to signal all flush errors were drained
+ * @last_cqe:            cqe handler for last wr
+ * @last_comp:           completes when all connection completions consumed
  */
 struct ib_conn {
        struct rdma_cm_id           *cma_id;
@@ -488,10 +470,12 @@ struct ib_conn {
        struct ib_recv_wr            rx_wr[ISER_MIN_POSTED_RX];
        struct iser_device          *device;
        struct iser_comp            *comp;
-       bool                         pi_support;
-       struct ib_send_wr            beacon;
-       struct completion            flush_comp;
        struct iser_fr_pool          fr_pool;
+       bool                         pi_support;
+       struct ib_send_wr            last;
+       struct ib_cqe                last_cqe;
+       struct ib_cqe                reg_cqe;
+       struct completion            last_comp;
 };
 
 /**
@@ -514,11 +498,7 @@ struct ib_conn {
  * @up_completion:    connection establishment completed
  *                    (state is ISER_CONN_UP)
  * @conn_list:        entry in ig conn list
- * @login_buf:        login data buffer (stores login parameters)
- * @login_req_buf:    login request buffer
- * @login_req_dma:    login request buffer dma address
- * @login_resp_buf:   login response buffer
- * @login_resp_dma:   login response buffer dma address
+ * @login_desc:       login descriptor
  * @rx_desc_head:     head of rx_descs cyclic buffer
  * @rx_descs:         rx buffers array (cyclic buffer)
  * @num_rx_descs:     number of rx descriptors
@@ -541,15 +521,13 @@ struct iser_conn {
        struct completion            ib_completion;
        struct completion            up_completion;
        struct list_head             conn_list;
-
-       char                         *login_buf;
-       char                         *login_req_buf, *login_resp_buf;
-       u64                          login_req_dma, login_resp_dma;
+       struct iser_login_desc       login_desc;
        unsigned int                 rx_desc_head;
        struct iser_rx_desc          *rx_descs;
        u32                          num_rx_descs;
        unsigned short               scsi_sg_tablesize;
        unsigned int                 scsi_max_sectors;
+       bool                         snd_w_inv;
 };
 
 /**
@@ -579,9 +557,8 @@ struct iscsi_iser_task {
 
 struct iser_page_vec {
        u64 *pages;
-       int length;
-       int offset;
-       int data_size;
+       int npages;
+       struct ib_mr fake_mr;
 };
 
 /**
@@ -633,12 +610,14 @@ int iser_conn_terminate(struct iser_conn *iser_conn);
 
 void iser_release_work(struct work_struct *work);
 
-void iser_rcv_completion(struct iser_rx_desc *desc,
-                        unsigned long dto_xfer_len,
-                        struct ib_conn *ib_conn);
-
-void iser_snd_completion(struct iser_tx_desc *desc,
-                        struct ib_conn *ib_conn);
+void iser_err_comp(struct ib_wc *wc, const char *type);
+void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc);
+void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc);
 
 void iser_task_rdma_init(struct iscsi_iser_task *task);
 
@@ -651,7 +630,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
                                     enum iser_data_dir cmd_dir);
 
 int iser_reg_rdma_mem(struct iscsi_iser_task *task,
-                     enum iser_data_dir dir);
+                     enum iser_data_dir dir,
+                     bool all_imm);
 void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
                         enum iser_data_dir dir);
 
@@ -719,4 +699,28 @@ iser_tx_next_wr(struct iser_tx_desc *tx_desc)
        return cur_wr;
 }
 
+static inline struct iser_conn *
+to_iser_conn(struct ib_conn *ib_conn)
+{
+       return container_of(ib_conn, struct iser_conn, ib_conn);
+}
+
+static inline struct iser_rx_desc *
+iser_rx(struct ib_cqe *cqe)
+{
+       return container_of(cqe, struct iser_rx_desc, cqe);
+}
+
+static inline struct iser_tx_desc *
+iser_tx(struct ib_cqe *cqe)
+{
+       return container_of(cqe, struct iser_tx_desc, cqe);
+}
+
+static inline struct iser_login_desc *
+iser_login(struct ib_cqe *cqe)
+{
+       return container_of(cqe, struct iser_login_desc, cqe);
+}
+
 #endif
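
The container_of() helpers above are the crux of the conversion away from magic wr_id values: every work request now points at a struct ib_cqe embedded in its descriptor, the CQ layer invokes cqe->done directly, and the handler recovers its descriptor by subtracting the member offset. A hedged sketch of the round trip (my_rx_done is illustrative, not part of the driver):

static void my_rx_done(struct ib_cq *cq, struct ib_wc *wc)
{
        /* wc->wr_cqe points at the cqe embedded in the rx descriptor */
        struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);

        if (unlikely(wc->status != IB_WC_SUCCESS))
                return;
        /* handle desc->iscsi_header, desc->data, wc->byte_len here */
}

/* at post time, wire the callback instead of encoding a wr_id: */
rx_desc->cqe.done = my_rx_done;
rx_wr.wr_cqe = &rx_desc->cqe;
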
index ffd00c42072959ed6747b5094e159cb34532675f..ed54b388e7adca899bfaec181c2f4c2ba75b1f47 100644 (file)
@@ -51,7 +51,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
        struct iscsi_iser_task *iser_task = task->dd_data;
        struct iser_mem_reg *mem_reg;
        int err;
-       struct iser_hdr *hdr = &iser_task->desc.iser_header;
+       struct iser_ctrl *hdr = &iser_task->desc.iser_header;
        struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN];
 
        err = iser_dma_map_task_data(iser_task,
@@ -72,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
                        return err;
        }
 
-       err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
+       err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN, false);
        if (err) {
                iser_err("Failed to set up Data-IN RDMA\n");
                return err;
@@ -104,7 +104,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
        struct iscsi_iser_task *iser_task = task->dd_data;
        struct iser_mem_reg *mem_reg;
        int err;
-       struct iser_hdr *hdr = &iser_task->desc.iser_header;
+       struct iser_ctrl *hdr = &iser_task->desc.iser_header;
        struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
        struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
 
@@ -126,7 +126,8 @@ iser_prepare_write_cmd(struct iscsi_task *task,
                        return err;
        }
 
-       err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
+       err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT,
+                               buf_out->data_len == imm_sz);
        if (err != 0) {
                iser_err("Failed to register write cmd RDMA mem\n");
                return err;
@@ -166,7 +167,7 @@ static void iser_create_send_desc(struct iser_conn  *iser_conn,
        ib_dma_sync_single_for_cpu(device->ib_device,
                tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
 
-       memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+       memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl));
        tx_desc->iser_header.flags = ISER_VER;
        tx_desc->num_sge = 1;
 }
@@ -174,73 +175,63 @@ static void iser_create_send_desc(struct iser_conn        *iser_conn,
 static void iser_free_login_buf(struct iser_conn *iser_conn)
 {
        struct iser_device *device = iser_conn->ib_conn.device;
+       struct iser_login_desc *desc = &iser_conn->login_desc;
 
-       if (!iser_conn->login_buf)
+       if (!desc->req)
                return;
 
-       if (iser_conn->login_req_dma)
-               ib_dma_unmap_single(device->ib_device,
-                                   iser_conn->login_req_dma,
-                                   ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+       ib_dma_unmap_single(device->ib_device, desc->req_dma,
+                           ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
 
-       if (iser_conn->login_resp_dma)
-               ib_dma_unmap_single(device->ib_device,
-                                   iser_conn->login_resp_dma,
-                                   ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+       ib_dma_unmap_single(device->ib_device, desc->rsp_dma,
+                           ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
 
-       kfree(iser_conn->login_buf);
+       kfree(desc->req);
+       kfree(desc->rsp);
 
        /* make sure we never redo any unmapping */
-       iser_conn->login_req_dma = 0;
-       iser_conn->login_resp_dma = 0;
-       iser_conn->login_buf = NULL;
+       desc->req = NULL;
+       desc->rsp = NULL;
 }
 
 static int iser_alloc_login_buf(struct iser_conn *iser_conn)
 {
        struct iser_device *device = iser_conn->ib_conn.device;
-       int                     req_err, resp_err;
-
-       BUG_ON(device == NULL);
-
-       iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
-                                    ISER_RX_LOGIN_SIZE, GFP_KERNEL);
-       if (!iser_conn->login_buf)
-               goto out_err;
-
-       iser_conn->login_req_buf  = iser_conn->login_buf;
-       iser_conn->login_resp_buf = iser_conn->login_buf +
-                                               ISCSI_DEF_MAX_RECV_SEG_LEN;
-
-       iser_conn->login_req_dma = ib_dma_map_single(device->ib_device,
-                                                    iser_conn->login_req_buf,
-                                                    ISCSI_DEF_MAX_RECV_SEG_LEN,
-                                                    DMA_TO_DEVICE);
-
-       iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device,
-                                                     iser_conn->login_resp_buf,
-                                                     ISER_RX_LOGIN_SIZE,
-                                                     DMA_FROM_DEVICE);
-
-       req_err  = ib_dma_mapping_error(device->ib_device,
-                                       iser_conn->login_req_dma);
-       resp_err = ib_dma_mapping_error(device->ib_device,
-                                       iser_conn->login_resp_dma);
-
-       if (req_err || resp_err) {
-               if (req_err)
-                       iser_conn->login_req_dma = 0;
-               if (resp_err)
-                       iser_conn->login_resp_dma = 0;
-               goto free_login_buf;
-       }
+       struct iser_login_desc *desc = &iser_conn->login_desc;
+
+       desc->req = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN, GFP_KERNEL);
+       if (!desc->req)
+               return -ENOMEM;
+
+       desc->req_dma = ib_dma_map_single(device->ib_device, desc->req,
+                                         ISCSI_DEF_MAX_RECV_SEG_LEN,
+                                         DMA_TO_DEVICE);
+       if (ib_dma_mapping_error(device->ib_device,
+                               desc->req_dma))
+               goto free_req;
+
+       desc->rsp = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+       if (!desc->rsp)
+               goto unmap_req;
+
+       desc->rsp_dma = ib_dma_map_single(device->ib_device, desc->rsp,
+                                          ISER_RX_LOGIN_SIZE,
+                                          DMA_FROM_DEVICE);
+       if (ib_dma_mapping_error(device->ib_device,
+                               desc->rsp_dma))
+               goto free_rsp;
+
        return 0;
 
-free_login_buf:
-       iser_free_login_buf(iser_conn);
+free_rsp:
+       kfree(desc->rsp);
+unmap_req:
+       ib_dma_unmap_single(device->ib_device, desc->req_dma,
+                           ISCSI_DEF_MAX_RECV_SEG_LEN,
+                           DMA_TO_DEVICE);
+free_req:
+       kfree(desc->req);
 
-out_err:
-       iser_err("unable to alloc or map login buf\n");
        return -ENOMEM;
 }
 
@@ -280,11 +271,11 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
                        goto rx_desc_dma_map_failed;
 
                rx_desc->dma_addr = dma_addr;
-
+               rx_desc->cqe.done = iser_task_rsp;
                rx_sg = &rx_desc->rx_sg;
-               rx_sg->addr   = rx_desc->dma_addr;
+               rx_sg->addr = rx_desc->dma_addr;
                rx_sg->length = ISER_RX_PAYLOAD_SIZE;
-               rx_sg->lkey   = device->pd->local_dma_lkey;
+               rx_sg->lkey = device->pd->local_dma_lkey;
        }
 
        iser_conn->rx_desc_head = 0;
@@ -383,6 +374,7 @@ int iser_send_command(struct iscsi_conn *conn,
 
        /* build the tx desc regd header and add it to the tx desc dto */
        tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+       tx_desc->cqe.done = iser_cmd_comp;
        iser_create_send_desc(iser_conn, tx_desc);
 
        if (hdr->flags & ISCSI_FLAG_CMD_READ) {
@@ -464,6 +456,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
        }
 
        tx_desc->type = ISCSI_TX_DATAOUT;
+       tx_desc->cqe.done = iser_dataout_comp;
        tx_desc->iser_header.flags = ISER_VER;
        memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
 
@@ -513,6 +506,7 @@ int iser_send_control(struct iscsi_conn *conn,
 
        /* build the tx desc regd header and add it to the tx desc dto */
        mdesc->type = ISCSI_TX_CONTROL;
+       mdesc->cqe.done = iser_ctrl_comp;
        iser_create_send_desc(iser_conn, mdesc);
 
        device = iser_conn->ib_conn.device;
@@ -520,25 +514,25 @@ int iser_send_control(struct iscsi_conn *conn,
        data_seg_len = ntoh24(task->hdr->dlength);
 
        if (data_seg_len > 0) {
+               struct iser_login_desc *desc = &iser_conn->login_desc;
                struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+
                if (task != conn->login_task) {
                        iser_err("data present on non login task!!!\n");
                        goto send_control_error;
                }
 
-               ib_dma_sync_single_for_cpu(device->ib_device,
-                       iser_conn->login_req_dma, task->data_count,
-                       DMA_TO_DEVICE);
+               ib_dma_sync_single_for_cpu(device->ib_device, desc->req_dma,
+                                          task->data_count, DMA_TO_DEVICE);
 
-               memcpy(iser_conn->login_req_buf, task->data, task->data_count);
+               memcpy(desc->req, task->data, task->data_count);
 
-               ib_dma_sync_single_for_device(device->ib_device,
-                       iser_conn->login_req_dma, task->data_count,
-                       DMA_TO_DEVICE);
+               ib_dma_sync_single_for_device(device->ib_device, desc->req_dma,
+                                             task->data_count, DMA_TO_DEVICE);
 
-               tx_dsg->addr    = iser_conn->login_req_dma;
-               tx_dsg->length  = task->data_count;
-               tx_dsg->lkey    = device->pd->local_dma_lkey;
+               tx_dsg->addr = desc->req_dma;
+               tx_dsg->length = task->data_count;
+               tx_dsg->lkey = device->pd->local_dma_lkey;
                mdesc->num_sge = 2;
        }
 
@@ -562,41 +556,126 @@ send_control_error:
        return err;
 }
 
-/**
- * iser_rcv_dto_completion - recv DTO completion
- */
-void iser_rcv_completion(struct iser_rx_desc *rx_desc,
-                        unsigned long rx_xfer_len,
-                        struct ib_conn *ib_conn)
+void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
-                                                  ib_conn);
+       struct ib_conn *ib_conn = wc->qp->qp_context;
+       struct iser_conn *iser_conn = to_iser_conn(ib_conn);
+       struct iser_login_desc *desc = iser_login(wc->wr_cqe);
        struct iscsi_hdr *hdr;
-       u64 rx_dma;
-       int rx_buflen, outstanding, count, err;
+       char *data;
+       int length;
 
-       /* differentiate between login to all other PDUs */
-       if ((char *)rx_desc == iser_conn->login_resp_buf) {
-               rx_dma = iser_conn->login_resp_dma;
-               rx_buflen = ISER_RX_LOGIN_SIZE;
-       } else {
-               rx_dma = rx_desc->dma_addr;
-               rx_buflen = ISER_RX_PAYLOAD_SIZE;
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               iser_err_comp(wc, "login_rsp");
+               return;
+       }
+
+       ib_dma_sync_single_for_cpu(ib_conn->device->ib_device,
+                                  desc->rsp_dma, ISER_RX_LOGIN_SIZE,
+                                  DMA_FROM_DEVICE);
+
+       hdr = desc->rsp + sizeof(struct iser_ctrl);
+       data = desc->rsp + ISER_HEADERS_LEN;
+       length = wc->byte_len - ISER_HEADERS_LEN;
+
+       iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+                hdr->itt, length);
+
+       iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data, length);
+
+       ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+                                     desc->rsp_dma, ISER_RX_LOGIN_SIZE,
+                                     DMA_FROM_DEVICE);
+
+       ib_conn->post_recv_buf_count--;
+}
+
+static inline void
+iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
+{
+       if (likely(rkey == desc->rsc.mr->rkey))
+               desc->rsc.mr_valid = 0;
+       else if (likely(rkey == desc->pi_ctx->sig_mr->rkey))
+               desc->pi_ctx->sig_mr_valid = 0;
+}
+
+static int
+iser_check_remote_inv(struct iser_conn *iser_conn,
+                     struct ib_wc *wc,
+                     struct iscsi_hdr *hdr)
+{
+       if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
+               struct iscsi_task *task;
+               u32 rkey = wc->ex.invalidate_rkey;
+
+               iser_dbg("conn %p: remote invalidation for rkey %#x\n",
+                        iser_conn, rkey);
+
+               if (unlikely(!iser_conn->snd_w_inv)) {
+                       iser_err("conn %p: unexepected remote invalidation, "
+                                "terminating connection\n", iser_conn);
+                       return -EPROTO;
+               }
+
+               task = iscsi_itt_to_ctask(iser_conn->iscsi_conn, hdr->itt);
+               if (likely(task)) {
+                       struct iscsi_iser_task *iser_task = task->dd_data;
+                       struct iser_fr_desc *desc;
+
+                       if (iser_task->dir[ISER_DIR_IN]) {
+                               desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h;
+                               iser_inv_desc(desc, rkey);
+                       }
+
+                       if (iser_task->dir[ISER_DIR_OUT]) {
+                               desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h;
+                               iser_inv_desc(desc, rkey);
+                       }
+               } else {
+                       iser_err("failed to get task for itt=%d\n", hdr->itt);
+                       return -EINVAL;
+               }
        }
 
-       ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
-                                  rx_buflen, DMA_FROM_DEVICE);
+       return 0;
+}
 
-       hdr = &rx_desc->iscsi_header;
+
+void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct ib_conn *ib_conn = wc->qp->qp_context;
+       struct iser_conn *iser_conn = to_iser_conn(ib_conn);
+       struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);
+       struct iscsi_hdr *hdr;
+       int length;
+       int outstanding, count, err;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               iser_err_comp(wc, "task_rsp");
+               return;
+       }
+
+       ib_dma_sync_single_for_cpu(ib_conn->device->ib_device,
+                                  desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
+                                  DMA_FROM_DEVICE);
+
+       hdr = &desc->iscsi_header;
+       length = wc->byte_len - ISER_HEADERS_LEN;
 
        iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
-                       hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
+                hdr->itt, length);
+
+       if (iser_check_remote_inv(iser_conn, wc, hdr)) {
+               iscsi_conn_failure(iser_conn->iscsi_conn,
+                                  ISCSI_ERR_CONN_FAILED);
+               return;
+       }
 
-       iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data,
-                       rx_xfer_len - ISER_HEADERS_LEN);
+       iscsi_iser_recv(iser_conn->iscsi_conn, hdr, desc->data, length);
 
-       ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
-                                     rx_buflen, DMA_FROM_DEVICE);
+       ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+                                     desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
+                                     DMA_FROM_DEVICE);
 
        /* decrementing conn->post_recv_buf_count only --after-- freeing the   *
         * task eliminates the need to worry on tasks which are completed in   *
@@ -604,9 +683,6 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
         * for the posted rx bufs refcount to become zero handles everything   */
        ib_conn->post_recv_buf_count--;
 
-       if (rx_dma == iser_conn->login_resp_dma)
-               return;
-
        outstanding = ib_conn->post_recv_buf_count;
        if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
                count = min(iser_conn->qp_max_recv_dtos - outstanding,
@@ -617,26 +693,47 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
        }
 }
 
-void iser_snd_completion(struct iser_tx_desc *tx_desc,
-                       struct ib_conn *ib_conn)
+void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc)
 {
+       if (unlikely(wc->status != IB_WC_SUCCESS))
+               iser_err_comp(wc, "command");
+}
+
+void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct iser_tx_desc *desc = iser_tx(wc->wr_cqe);
        struct iscsi_task *task;
-       struct iser_device *device = ib_conn->device;
 
-       if (tx_desc->type == ISCSI_TX_DATAOUT) {
-               ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
-                                       ISER_HEADERS_LEN, DMA_TO_DEVICE);
-               kmem_cache_free(ig.desc_cache, tx_desc);
-               tx_desc = NULL;
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               iser_err_comp(wc, "control");
+               return;
        }
 
-       if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) {
-               /* this arithmetic is legal by libiscsi dd_data allocation */
-               task = (void *) ((long)(void *)tx_desc -
-                                 sizeof(struct iscsi_task));
-               if (task->hdr->itt == RESERVED_ITT)
-                       iscsi_put_task(task);
-       }
+       /* this arithmetic is valid because of libiscsi's dd_data allocation */
+       task = (void *)desc - sizeof(struct iscsi_task);
+       if (task->hdr->itt == RESERVED_ITT)
+               iscsi_put_task(task);
+}
+
+void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct iser_tx_desc *desc = iser_tx(wc->wr_cqe);
+       struct ib_conn *ib_conn = wc->qp->qp_context;
+       struct iser_device *device = ib_conn->device;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS))
+               iser_err_comp(wc, "dataout");
+
+       ib_dma_unmap_single(device->ib_device, desc->dma_addr,
+                           ISER_HEADERS_LEN, DMA_TO_DEVICE);
+       kmem_cache_free(ig.desc_cache, desc);
+}
+
+void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct ib_conn *ib_conn = wc->qp->qp_context;
+
+       complete(&ib_conn->last_comp);
 }
 
 void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
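
The beacon-to-"last" rename visible in the struct changes earlier implies the drain flow below. This is inferred from the fields and iser_last_comp() above, since the posting site is outside this diff: send completions on a QP arrive in order, so a final marker send that completes guarantees every earlier WR has already flushed.

/* Inferred drain sequence; not verbatim driver code. */
struct ib_send_wr *bad_wr;

ib_conn->last.opcode = IB_WR_SEND;
ib_conn->last.wr_cqe = &ib_conn->last_cqe;
ib_conn->last_cqe.done = iser_last_comp;

if (ib_post_send(qp, &ib_conn->last, &bad_wr) == 0)
        wait_for_completion(&ib_conn->last_comp); /* all prior WRs drained */
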
index ea765fb9664d36759ff1a90d11272c1bc13ca457..9a391cc5b9b3f45f16f0a49138aeda5b96030b01 100644 (file)
@@ -49,7 +49,7 @@ int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
                     struct iser_reg_resources *rsc,
                     struct iser_mem_reg *mem_reg);
 
-static struct iser_reg_ops fastreg_ops = {
+static const struct iser_reg_ops fastreg_ops = {
        .alloc_reg_res  = iser_alloc_fastreg_pool,
        .free_reg_res   = iser_free_fastreg_pool,
        .reg_mem        = iser_fast_reg_mr,
@@ -58,7 +58,7 @@ static struct iser_reg_ops fastreg_ops = {
        .reg_desc_put   = iser_reg_desc_put_fr,
 };
 
-static struct iser_reg_ops fmr_ops = {
+static const struct iser_reg_ops fmr_ops = {
        .alloc_reg_res  = iser_alloc_fmr_pool,
        .free_reg_res   = iser_free_fmr_pool,
        .reg_mem        = iser_fast_reg_fmr,
@@ -67,19 +67,24 @@ static struct iser_reg_ops fmr_ops = {
        .reg_desc_put   = iser_reg_desc_put_fmr,
 };
 
+void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc)
+{
+       iser_err_comp(wc, "memreg");
+}
+
 int iser_assign_reg_ops(struct iser_device *device)
 {
-       struct ib_device_attr *dev_attr = &device->dev_attr;
+       struct ib_device *ib_dev = device->ib_device;
 
        /* Assign function handles  - based on FMR support */
-       if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
-           device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
+       if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr &&
+           ib_dev->map_phys_fmr && ib_dev->unmap_fmr) {
                iser_info("FMR supported, using FMR for registration\n");
                device->reg_ops = &fmr_ops;
-       } else
-       if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+       } else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
                iser_info("FastReg supported, using FastReg for registration\n");
                device->reg_ops = &fastreg_ops;
+               device->remote_inv_sup = iser_always_reg;
        } else {
                iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
                return -1;
@@ -131,67 +136,6 @@ iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
 {
 }
 
-#define IS_4K_ALIGNED(addr)    ((((unsigned long)addr) & ~MASK_4K) == 0)
-
-/**
- * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
- * and returns the length of resulting physical address array (may be less than
- * the original due to possible compaction).
- *
- * we build a "page vec" under the assumption that the SG meets the RDMA
- * alignment requirements. Other then the first and last SG elements, all
- * the "internal" elements can be compacted into a list whose elements are
- * dma addresses of physical pages. The code supports also the weird case
- * where --few fragments of the same page-- are present in the SG as
- * consecutive elements. Also, it handles one entry SG.
- */
-
-static int iser_sg_to_page_vec(struct iser_data_buf *data,
-                              struct ib_device *ibdev, u64 *pages,
-                              int *offset, int *data_size)
-{
-       struct scatterlist *sg, *sgl = data->sg;
-       u64 start_addr, end_addr, page, chunk_start = 0;
-       unsigned long total_sz = 0;
-       unsigned int dma_len;
-       int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;
-
-       /* compute the offset of first element */
-       *offset = (u64) sgl[0].offset & ~MASK_4K;
-
-       new_chunk = 1;
-       cur_page  = 0;
-       for_each_sg(sgl, sg, data->dma_nents, i) {
-               start_addr = ib_sg_dma_address(ibdev, sg);
-               if (new_chunk)
-                       chunk_start = start_addr;
-               dma_len = ib_sg_dma_len(ibdev, sg);
-               end_addr = start_addr + dma_len;
-               total_sz += dma_len;
-
-               /* collect page fragments until aligned or end of SG list */
-               if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
-                       new_chunk = 0;
-                       continue;
-               }
-               new_chunk = 1;
-
-               /* address of the first page in the contiguous chunk;
-                  masking relevant for the very first SG entry,
-                  which might be unaligned */
-               page = chunk_start & MASK_4K;
-               do {
-                       pages[cur_page++] = page;
-                       page += SIZE_4K;
-               } while (page < end_addr);
-       }
-
-       *data_size = total_sz;
-       iser_dbg("page_vec->data_size:%d cur_page %d\n",
-                *data_size, cur_page);
-       return cur_page;
-}
-
 static void iser_data_buf_dump(struct iser_data_buf *data,
                               struct ib_device *ibdev)
 {
@@ -210,10 +154,10 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec)
 {
        int i;
 
-       iser_err("page vec length %d data size %d\n",
-                page_vec->length, page_vec->data_size);
-       for (i = 0; i < page_vec->length; i++)
-               iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]);
+       iser_err("page vec npages %d data length %d\n",
+                page_vec->npages, page_vec->fake_mr.length);
+       for (i = 0; i < page_vec->npages; i++)
+               iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]);
 }
 
 int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
@@ -251,7 +195,11 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
        struct scatterlist *sg = mem->sg;
 
        reg->sge.lkey = device->pd->local_dma_lkey;
-       reg->rkey = device->mr->rkey;
+       /*
+        * FIXME: rework the registration code path to differentiate
+        * rkey/lkey use cases
+        */
+       reg->rkey = device->mr ? device->mr->rkey : 0;
        reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
        reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
 
@@ -262,11 +210,16 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
        return 0;
 }
 
-/**
- * iser_reg_page_vec - Register physical memory
- *
- * returns: 0 on success, errno code on failure
- */
+static int iser_set_page(struct ib_mr *mr, u64 addr)
+{
+       struct iser_page_vec *page_vec =
+               container_of(mr, struct iser_page_vec, fake_mr);
+
+       page_vec->pages[page_vec->npages++] = addr;
+
+       return 0;
+}
+
 static
 int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
                      struct iser_data_buf *mem,
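
The deleted iser_sg_to_page_vec() is replaced by the generic ib_sg_to_pages() walker; the FMR path keeps a throwaway struct ib_mr (fake_mr) solely so the core helper has somewhere to record iova, length and page_size while it feeds each page address to the iser_set_page() callback. The call shape, mirroring the hunk just below:

page_vec->npages = 0;
page_vec->fake_mr.page_size = SIZE_4K;

/* walks mem->sg in page_size steps, invoking iser_set_page() per page
 * and filling fake_mr.iova/length as a side effect */
plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, mem->size, iser_set_page);
if (plen < mem->size)
        return -EINVAL;         /* gappy SG list the vector can't express */
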
@@ -280,22 +233,19 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
        struct ib_pool_fmr *fmr;
        int ret, plen;
 
-       plen = iser_sg_to_page_vec(mem, device->ib_device,
-                                  page_vec->pages,
-                                  &page_vec->offset,
-                                  &page_vec->data_size);
-       page_vec->length = plen;
-       if (plen * SIZE_4K < page_vec->data_size) {
+       page_vec->npages = 0;
+       page_vec->fake_mr.page_size = SIZE_4K;
+       plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg,
+                             mem->size, iser_set_page);
+       if (unlikely(plen < mem->size)) {
                iser_err("page vec too short to hold this SG\n");
                iser_data_buf_dump(mem, device->ib_device);
                iser_dump_page_vec(page_vec);
                return -EINVAL;
        }
 
-       fmr  = ib_fmr_pool_map_phys(fmr_pool,
-                                   page_vec->pages,
-                                   page_vec->length,
-                                   page_vec->pages[0]);
+       fmr  = ib_fmr_pool_map_phys(fmr_pool, page_vec->pages,
+                                   page_vec->npages, page_vec->pages[0]);
        if (IS_ERR(fmr)) {
                ret = PTR_ERR(fmr);
                iser_err("ib_fmr_pool_map_phys failed: %d\n", ret);
@@ -304,8 +254,8 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
 
        reg->sge.lkey = fmr->fmr->lkey;
        reg->rkey = fmr->fmr->rkey;
-       reg->sge.addr = page_vec->pages[0] + page_vec->offset;
-       reg->sge.length = page_vec->data_size;
+       reg->sge.addr = page_vec->fake_mr.iova;
+       reg->sge.length = page_vec->fake_mr.length;
        reg->mem_h = fmr;
 
        iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
@@ -413,19 +363,16 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
                *mask |= ISER_CHECK_GUARD;
 }
 
-static void
-iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
+static inline void
+iser_inv_rkey(struct ib_send_wr *inv_wr,
+             struct ib_mr *mr,
+             struct ib_cqe *cqe)
 {
-       u32 rkey;
-
        inv_wr->opcode = IB_WR_LOCAL_INV;
-       inv_wr->wr_id = ISER_FASTREG_LI_WRID;
+       inv_wr->wr_cqe = cqe;
        inv_wr->ex.invalidate_rkey = mr->rkey;
        inv_wr->send_flags = 0;
        inv_wr->num_sge = 0;
-
-       rkey = ib_inc_rkey(mr->rkey);
-       ib_update_fast_reg_key(mr, rkey);
 }
 
 static int
@@ -437,7 +384,9 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 {
        struct iser_tx_desc *tx_desc = &iser_task->desc;
        struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
+       struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
        struct ib_sig_handover_wr *wr;
+       struct ib_mr *mr = pi_ctx->sig_mr;
        int ret;
 
        memset(sig_attrs, 0, sizeof(*sig_attrs));
@@ -447,17 +396,19 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 
        iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
 
-       if (!pi_ctx->sig_mr_valid)
-               iser_inv_rkey(iser_tx_next_wr(tx_desc), pi_ctx->sig_mr);
+       if (pi_ctx->sig_mr_valid)
+               iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+
+       ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
 
        wr = sig_handover_wr(iser_tx_next_wr(tx_desc));
        wr->wr.opcode = IB_WR_REG_SIG_MR;
-       wr->wr.wr_id = ISER_FASTREG_LI_WRID;
+       wr->wr.wr_cqe = cqe;
        wr->wr.sg_list = &data_reg->sge;
        wr->wr.num_sge = 1;
        wr->wr.send_flags = 0;
        wr->sig_attrs = sig_attrs;
-       wr->sig_mr = pi_ctx->sig_mr;
+       wr->sig_mr = mr;
        if (scsi_prot_sg_count(iser_task->sc))
                wr->prot = &prot_reg->sge;
        else
@@ -465,10 +416,10 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
        wr->access_flags = IB_ACCESS_LOCAL_WRITE |
                           IB_ACCESS_REMOTE_READ |
                           IB_ACCESS_REMOTE_WRITE;
-       pi_ctx->sig_mr_valid = 0;
+       pi_ctx->sig_mr_valid = 1;
 
-       sig_reg->sge.lkey = pi_ctx->sig_mr->lkey;
-       sig_reg->rkey = pi_ctx->sig_mr->rkey;
+       sig_reg->sge.lkey = mr->lkey;
+       sig_reg->rkey = mr->rkey;
        sig_reg->sge.addr = 0;
        sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
 
@@ -485,12 +436,15 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
                            struct iser_mem_reg *reg)
 {
        struct iser_tx_desc *tx_desc = &iser_task->desc;
+       struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe;
        struct ib_mr *mr = rsc->mr;
        struct ib_reg_wr *wr;
        int n;
 
-       if (!rsc->mr_valid)
-               iser_inv_rkey(iser_tx_next_wr(tx_desc), mr);
+       if (rsc->mr_valid)
+               iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
+
+       ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
 
        n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K);
        if (unlikely(n != mem->size)) {
@@ -501,7 +455,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
 
        wr = reg_wr(iser_tx_next_wr(tx_desc));
        wr->wr.opcode = IB_WR_REG_MR;
-       wr->wr.wr_id = ISER_FASTREG_LI_WRID;
+       wr->wr.wr_cqe = cqe;
        wr->wr.send_flags = 0;
        wr->wr.num_sge = 0;
        wr->mr = mr;
@@ -510,7 +464,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
                     IB_ACCESS_REMOTE_WRITE |
                     IB_ACCESS_REMOTE_READ;
 
-       rsc->mr_valid = 0;
+       rsc->mr_valid = 1;
 
        reg->sge.lkey = mr->lkey;
        reg->rkey = mr->rkey;
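Note the inverted flag semantics in the two hunks above: previously *_mr_valid == 1 meant a freshly allocated key needing no local invalidate, while now 1 means the MR carries a live registration that must be invalidated before reuse. The rkey bump also moves out of iser_inv_rkey() so the key changes on every registration, not only on invalidate:

	/* new MR lifecycle, fields as in the patch:
	 *   alloc:     rsc->mr_valid = 0;           nothing registered yet
	 *   register:  if (rsc->mr_valid)           invalidate previous use
	 *                      iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe);
	 *              ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
	 *              ... post IB_WR_REG_MR ...
	 *              rsc->mr_valid = 1;           registration now live
	 */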
@@ -554,7 +508,8 @@ iser_reg_data_sg(struct iscsi_iser_task *task,
 }
 
 int iser_reg_rdma_mem(struct iscsi_iser_task *task,
-                     enum iser_data_dir dir)
+                     enum iser_data_dir dir,
+                     bool all_imm)
 {
        struct ib_conn *ib_conn = &task->iser_conn->ib_conn;
        struct iser_device *device = ib_conn->device;
@@ -565,8 +520,8 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task,
        bool use_dma_key;
        int err;
 
-       use_dma_key = (mem->dma_nents == 1 && !iser_always_reg &&
-                      scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL);
+       use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) &&
+                     scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL;
 
        if (!use_dma_key) {
                desc = device->reg_ops->reg_desc_get(ib_conn);
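The new all_imm argument lets commands whose entire payload travels as immediate data skip registration even with iser_always_reg set, since the target never issues RDMA against the buffer; a single-entry S/G list then rides on the local DMA lkey via iser_reg_dma() above. Annotated, the condition reads:

	use_dma_key = mem->dma_nents == 1 &&		/* one contiguous entry  */
		      (all_imm || !iser_always_reg) &&	/* no remote access need */
		      scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; /* no T10-PI */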
index 42f4da620f2e9f97062b9183b6ebf3ed54e56cfe..40c0f4978e2f00b5173001f54ec41a4bc651b839 100644 (file)
 #define ISER_MAX_CQ_LEN                (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
                                 ISCSI_ISER_MAX_CONN)
 
-static int iser_cq_poll_limit = 512;
-
-static void iser_cq_tasklet_fn(unsigned long data);
-static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
-
-static void iser_cq_event_callback(struct ib_event *cause, void *context)
-{
-       iser_err("cq event %s (%d)\n",
-                ib_event_msg(cause->event), cause->event);
-}
-
 static void iser_qp_event_callback(struct ib_event *cause, void *context)
 {
        iser_err("qp event %s (%d)\n",
@@ -78,59 +67,40 @@ static void iser_event_handler(struct ib_event_handler *handler,
  */
 static int iser_create_device_ib_res(struct iser_device *device)
 {
-       struct ib_device_attr *dev_attr = &device->dev_attr;
+       struct ib_device *ib_dev = device->ib_device;
        int ret, i, max_cqe;
 
-       ret = ib_query_device(device->ib_device, dev_attr);
-       if (ret) {
-               pr_warn("Query device failed for %s\n", device->ib_device->name);
-               return ret;
-       }
-
        ret = iser_assign_reg_ops(device);
        if (ret)
                return ret;
 
        device->comps_used = min_t(int, num_online_cpus(),
-                                device->ib_device->num_comp_vectors);
+                                ib_dev->num_comp_vectors);
 
        device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
                                GFP_KERNEL);
        if (!device->comps)
                goto comps_err;
 
-       max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
+       max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);
 
        iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
-                 device->comps_used, device->ib_device->name,
-                 device->ib_device->num_comp_vectors, max_cqe);
+                 device->comps_used, ib_dev->name,
+                 ib_dev->num_comp_vectors, max_cqe);
 
-       device->pd = ib_alloc_pd(device->ib_device);
+       device->pd = ib_alloc_pd(ib_dev);
        if (IS_ERR(device->pd))
                goto pd_err;
 
        for (i = 0; i < device->comps_used; i++) {
-               struct ib_cq_init_attr cq_attr = {};
                struct iser_comp *comp = &device->comps[i];
 
-               comp->device = device;
-               cq_attr.cqe = max_cqe;
-               cq_attr.comp_vector = i;
-               comp->cq = ib_create_cq(device->ib_device,
-                                       iser_cq_callback,
-                                       iser_cq_event_callback,
-                                       (void *)comp,
-                                       &cq_attr);
+               comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i,
+                                      IB_POLL_SOFTIRQ);
                if (IS_ERR(comp->cq)) {
                        comp->cq = NULL;
                        goto cq_err;
                }
-
-               if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
-                       goto cq_err;
-
-               tasklet_init(&comp->tasklet, iser_cq_tasklet_fn,
-                            (unsigned long)comp);
        }
 
        if (!iser_always_reg) {
@@ -140,11 +110,11 @@ static int iser_create_device_ib_res(struct iser_device *device)
 
                device->mr = ib_get_dma_mr(device->pd, access);
                if (IS_ERR(device->mr))
-                       goto dma_mr_err;
+                       goto cq_err;
        }
 
-       INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
-                               iser_event_handler);
+       INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev,
+                             iser_event_handler);
        if (ib_register_event_handler(&device->event_handler))
                goto handler_err;
 
@@ -153,15 +123,12 @@ static int iser_create_device_ib_res(struct iser_device *device)
 handler_err:
        if (device->mr)
                ib_dereg_mr(device->mr);
-dma_mr_err:
-       for (i = 0; i < device->comps_used; i++)
-               tasklet_kill(&device->comps[i].tasklet);
 cq_err:
        for (i = 0; i < device->comps_used; i++) {
                struct iser_comp *comp = &device->comps[i];
 
                if (comp->cq)
-                       ib_destroy_cq(comp->cq);
+                       ib_free_cq(comp->cq);
        }
        ib_dealloc_pd(device->pd);
 pd_err:
@@ -182,8 +149,7 @@ static void iser_free_device_ib_res(struct iser_device *device)
        for (i = 0; i < device->comps_used; i++) {
                struct iser_comp *comp = &device->comps[i];
 
-               tasklet_kill(&comp->tasklet);
-               ib_destroy_cq(comp->cq);
+               ib_free_cq(comp->cq);
                comp->cq = NULL;
        }
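This is the heart of the series: ib_create_cq() plus a private tasklet and IB_CQ_NEXT_COMP re-arming gives way to the core CQ API, and every work request carries a struct ib_cqe whose .done callback replaces wr_id decoding. A minimal sketch of the model with hypothetical driver names, not taken from the patch:

	struct my_desc {
		struct ib_cqe cqe;		/* embedded completion entry */
		u64 payload;
	};

	static void my_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct my_desc *desc =
			container_of(wc->wr_cqe, struct my_desc, cqe);

		if (unlikely(wc->status != IB_WC_SUCCESS))
			return;			/* errors handled centrally */
		/* process desc->payload */
	}

	static int my_setup(struct ib_device *ib_dev, struct my_desc *desc,
			    struct ib_send_wr *wr, int nr_cqe, int vector)
	{
		struct ib_cq *cq;

		/* one call replaces ib_create_cq + ib_req_notify_cq +
		 * tasklet_init; the core polls in softirq context and
		 * re-arms the CQ on our behalf */
		cq = ib_alloc_cq(ib_dev, NULL, nr_cqe, vector, IB_POLL_SOFTIRQ);
		if (IS_ERR(cq))
			return PTR_ERR(cq);

		desc->cqe.done = my_done;
		wr->wr_cqe = &desc->cqe; /* not wr->wr_id = (uintptr_t)desc */
		return 0;
	}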
 
@@ -299,7 +265,7 @@ iser_alloc_reg_res(struct ib_device *ib_device,
                iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
                return ret;
        }
-       res->mr_valid = 1;
+       res->mr_valid = 0;
 
        return 0;
 }
@@ -336,7 +302,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device,
                ret = PTR_ERR(pi_ctx->sig_mr);
                goto sig_mr_failure;
        }
-       pi_ctx->sig_mr_valid = 1;
+       pi_ctx->sig_mr_valid = 0;
        desc->pi_ctx->sig_protected = 0;
 
        return 0;
@@ -461,10 +427,9 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
  */
 static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
 {
-       struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
-                                                  ib_conn);
+       struct iser_conn *iser_conn = to_iser_conn(ib_conn);
        struct iser_device      *device;
-       struct ib_device_attr *dev_attr;
+       struct ib_device        *ib_dev;
        struct ib_qp_init_attr  init_attr;
        int                     ret = -ENOMEM;
        int index, min_index = 0;
@@ -472,7 +437,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
        BUG_ON(ib_conn->device == NULL);
 
        device = ib_conn->device;
-       dev_attr = &device->dev_attr;
+       ib_dev = device->ib_device;
 
        memset(&init_attr, 0, sizeof init_attr);
 
@@ -503,16 +468,16 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
                iser_conn->max_cmds =
                        ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
        } else {
-               if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
+               if (ib_dev->attrs.max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
                        init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS + 1;
                        iser_conn->max_cmds =
                                ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
                } else {
-                       init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
+                       init_attr.cap.max_send_wr = ib_dev->attrs.max_qp_wr;
                        iser_conn->max_cmds =
-                               ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
+                               ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr);
                        iser_dbg("device %s supports max_send_wr %d\n",
-                                device->ib_device->name, dev_attr->max_qp_wr);
+                                device->ib_device->name, ib_dev->attrs.max_qp_wr);
                }
        }
 
@@ -724,13 +689,13 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
                                 iser_conn, err);
 
                /* post an indication that all flush errors were consumed */
-               err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
+               err = ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr);
                if (err) {
-                       iser_err("conn %p failed to post beacon", ib_conn);
+                       iser_err("conn %p failed to post last wr", ib_conn);
                        return 1;
                }
 
-               wait_for_completion(&ib_conn->flush_comp);
+               wait_for_completion(&ib_conn->last_comp);
        }
 
        return 1;
@@ -756,7 +721,7 @@ iser_calc_scsi_params(struct iser_conn *iser_conn,
 
        sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K);
        sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE,
-                                device->dev_attr.max_fast_reg_page_list_len);
+                                device->ib_device->attrs.max_fast_reg_page_list_len);
 
        if (sg_tablesize > sup_sg_tablesize) {
                sg_tablesize = sup_sg_tablesize;
@@ -799,7 +764,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
 
        /* connection T10-PI support */
        if (iser_pi_enable) {
-               if (!(device->dev_attr.device_cap_flags &
+               if (!(device->ib_device->attrs.device_cap_flags &
                      IB_DEVICE_SIGNATURE_HANDOVER)) {
                        iser_warn("T10-PI requested but not supported on %s, "
                                  "continue without T10-PI\n",
@@ -841,16 +806,17 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
                goto failure;
 
        memset(&conn_param, 0, sizeof conn_param);
-       conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
+       conn_param.responder_resources = device->ib_device->attrs.max_qp_rd_atom;
        conn_param.initiator_depth     = 1;
        conn_param.retry_count         = 7;
        conn_param.rnr_retry_count     = 6;
 
        memset(&req_hdr, 0, sizeof(req_hdr));
-       req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
-                       ISER_SEND_W_INV_NOT_SUPPORTED);
-       conn_param.private_data         = (void *)&req_hdr;
-       conn_param.private_data_len     = sizeof(struct iser_cm_hdr);
+       req_hdr.flags = ISER_ZBVA_NOT_SUP;
+       if (!device->remote_inv_sup)
+               req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP;
+       conn_param.private_data = (void *)&req_hdr;
+       conn_param.private_data_len = sizeof(struct iser_cm_hdr);
 
        ret = rdma_connect(cma_id, &conn_param);
        if (ret) {
@@ -863,7 +829,8 @@ failure:
        iser_connect_error(cma_id);
 }
 
-static void iser_connected_handler(struct rdma_cm_id *cma_id)
+static void iser_connected_handler(struct rdma_cm_id *cma_id,
+                                  const void *private_data)
 {
        struct iser_conn *iser_conn;
        struct ib_qp_attr attr;
@@ -877,6 +844,15 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id)
        (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
        iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num);
 
+       if (private_data) {
+               u8 flags = *(u8 *)private_data;
+
+               iser_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP);
+       }
+
+       iser_info("conn %p: negotiated %s invalidation\n",
+                 iser_conn, iser_conn->snd_w_inv ? "remote" : "local");
+
        iser_conn->state = ISER_CONN_UP;
        complete(&iser_conn->up_completion);
 }
@@ -928,7 +904,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
                iser_route_handler(cma_id);
                break;
        case RDMA_CM_EVENT_ESTABLISHED:
-               iser_connected_handler(cma_id);
+               iser_connected_handler(cma_id, event->param.conn.private_data);
                break;
        case RDMA_CM_EVENT_ADDR_ERROR:
        case RDMA_CM_EVENT_ROUTE_ERROR:
@@ -967,14 +943,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
 
 void iser_conn_init(struct iser_conn *iser_conn)
 {
+       struct ib_conn *ib_conn = &iser_conn->ib_conn;
+
        iser_conn->state = ISER_CONN_INIT;
-       iser_conn->ib_conn.post_recv_buf_count = 0;
-       init_completion(&iser_conn->ib_conn.flush_comp);
        init_completion(&iser_conn->stop_completion);
        init_completion(&iser_conn->ib_completion);
        init_completion(&iser_conn->up_completion);
        INIT_LIST_HEAD(&iser_conn->conn_list);
        mutex_init(&iser_conn->state_mutex);
+
+       ib_conn->post_recv_buf_count = 0;
+       ib_conn->reg_cqe.done = iser_reg_comp;
+       ib_conn->last_cqe.done = iser_last_comp;
+       ib_conn->last.wr_cqe = &ib_conn->last_cqe;
+       ib_conn->last.opcode = IB_WR_SEND;
+       init_completion(&ib_conn->last_comp);
 }
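The "last" send WR initialized here replaces the old beacon, as the iser_conn_terminate() hunk above shows. Pieced together from those hunks, the drain handshake is:

	/*   ib_conn->last.opcode = IB_WR_SEND;     zero-length marker WR
	 *   ib_conn->last.wr_cqe = &ib_conn->last_cqe;
	 *   ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr);
	 *   wait_for_completion(&ib_conn->last_comp);
	 * last_cqe.done (iser_last_comp, defined elsewhere in the series)
	 * presumably completes last_comp once every earlier flush error has
	 * been consumed, mirroring the old ISER_BEACON_WRID special case. */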
 
  /**
@@ -1000,9 +983,6 @@ int iser_connect(struct iser_conn   *iser_conn,
 
        iser_conn->state = ISER_CONN_PENDING;
 
-       ib_conn->beacon.wr_id = ISER_BEACON_WRID;
-       ib_conn->beacon.opcode = IB_WR_SEND;
-
        ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler,
                                         (void *)iser_conn,
                                         RDMA_PS_TCP, IB_QPT_RC);
@@ -1045,56 +1025,60 @@ connect_failure:
 
 int iser_post_recvl(struct iser_conn *iser_conn)
 {
-       struct ib_recv_wr rx_wr, *rx_wr_failed;
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
-       struct ib_sge     sge;
+       struct iser_login_desc *desc = &iser_conn->login_desc;
+       struct ib_recv_wr wr, *wr_failed;
        int ib_ret;
 
-       sge.addr   = iser_conn->login_resp_dma;
-       sge.length = ISER_RX_LOGIN_SIZE;
-       sge.lkey   = ib_conn->device->pd->local_dma_lkey;
+       desc->sge.addr = desc->rsp_dma;
+       desc->sge.length = ISER_RX_LOGIN_SIZE;
+       desc->sge.lkey = ib_conn->device->pd->local_dma_lkey;
 
-       rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
-       rx_wr.sg_list = &sge;
-       rx_wr.num_sge = 1;
-       rx_wr.next    = NULL;
+       desc->cqe.done = iser_login_rsp;
+       wr.wr_cqe = &desc->cqe;
+       wr.sg_list = &desc->sge;
+       wr.num_sge = 1;
+       wr.next = NULL;
 
        ib_conn->post_recv_buf_count++;
-       ib_ret  = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
+       ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed);
        if (ib_ret) {
                iser_err("ib_post_recv failed ret=%d\n", ib_ret);
                ib_conn->post_recv_buf_count--;
        }
+
        return ib_ret;
 }
 
 int iser_post_recvm(struct iser_conn *iser_conn, int count)
 {
-       struct ib_recv_wr *rx_wr, *rx_wr_failed;
-       int i, ib_ret;
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        unsigned int my_rx_head = iser_conn->rx_desc_head;
        struct iser_rx_desc *rx_desc;
+       struct ib_recv_wr *wr, *wr_failed;
+       int i, ib_ret;
 
-       for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
-               rx_desc         = &iser_conn->rx_descs[my_rx_head];
-               rx_wr->wr_id    = (uintptr_t)rx_desc;
-               rx_wr->sg_list  = &rx_desc->rx_sg;
-               rx_wr->num_sge  = 1;
-               rx_wr->next     = rx_wr + 1;
+       for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) {
+               rx_desc = &iser_conn->rx_descs[my_rx_head];
+               rx_desc->cqe.done = iser_task_rsp;
+               wr->wr_cqe = &rx_desc->cqe;
+               wr->sg_list = &rx_desc->rx_sg;
+               wr->num_sge = 1;
+               wr->next = wr + 1;
                my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
        }
 
-       rx_wr--;
-       rx_wr->next = NULL; /* mark end of work requests list */
+       wr--;
+       wr->next = NULL; /* mark end of work requests list */
 
        ib_conn->post_recv_buf_count += count;
-       ib_ret  = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
+       ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed);
        if (ib_ret) {
                iser_err("ib_post_recv failed ret=%d\n", ib_ret);
                ib_conn->post_recv_buf_count -= count;
        } else
                iser_conn->rx_desc_head = my_rx_head;
+
        return ib_ret;
 }
 
@@ -1115,7 +1099,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
                                      DMA_TO_DEVICE);
 
        wr->next = NULL;
-       wr->wr_id = (uintptr_t)tx_desc;
+       wr->wr_cqe = &tx_desc->cqe;
        wr->sg_list = tx_desc->tx_sg;
        wr->num_sge = tx_desc->num_sge;
        wr->opcode = IB_WR_SEND;
@@ -1129,149 +1113,6 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
        return ib_ret;
 }
 
-/**
- * is_iser_tx_desc - Indicate if the completion wr_id
- *     is a TX descriptor or not.
- * @iser_conn: iser connection
- * @wr_id: completion WR identifier
- *
- * Since we cannot rely on wc opcode in FLUSH errors
- * we must work around it by checking if the wr_id address
- * falls in the iser connection rx_descs buffer. If so
- * it is an RX descriptor, otherwize it is a TX.
- */
-static inline bool
-is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
-{
-       void *start = iser_conn->rx_descs;
-       int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
-
-       if (wr_id >= start && wr_id < start + len)
-               return false;
-
-       return true;
-}
-
-/**
- * iser_handle_comp_error() - Handle error completion
- * @ib_conn:   connection RDMA resources
- * @wc:        work completion
- *
- * Notes: We may handle a FLUSH error completion and in this case
- *        we only cleanup in case TX type was DATAOUT. For non-FLUSH
- *        error completion we should also notify iscsi layer that
- *        connection is failed (in case we passed bind stage).
- */
-static void
-iser_handle_comp_error(struct ib_conn *ib_conn,
-                      struct ib_wc *wc)
-{
-       void *wr_id = (void *)(uintptr_t)wc->wr_id;
-       struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
-                                                  ib_conn);
-
-       if (wc->status != IB_WC_WR_FLUSH_ERR)
-               if (iser_conn->iscsi_conn)
-                       iscsi_conn_failure(iser_conn->iscsi_conn,
-                                          ISCSI_ERR_CONN_FAILED);
-
-       if (wc->wr_id == ISER_FASTREG_LI_WRID)
-               return;
-
-       if (is_iser_tx_desc(iser_conn, wr_id)) {
-               struct iser_tx_desc *desc = wr_id;
-
-               if (desc->type == ISCSI_TX_DATAOUT)
-                       kmem_cache_free(ig.desc_cache, desc);
-       } else {
-               ib_conn->post_recv_buf_count--;
-       }
-}
-
-/**
- * iser_handle_wc - handle a single work completion
- * @wc: work completion
- *
- * Soft-IRQ context, work completion can be either
- * SEND or RECV, and can turn out successful or
- * with error (or flush error).
- */
-static void iser_handle_wc(struct ib_wc *wc)
-{
-       struct ib_conn *ib_conn;
-       struct iser_tx_desc *tx_desc;
-       struct iser_rx_desc *rx_desc;
-
-       ib_conn = wc->qp->qp_context;
-       if (likely(wc->status == IB_WC_SUCCESS)) {
-               if (wc->opcode == IB_WC_RECV) {
-                       rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
-                       iser_rcv_completion(rx_desc, wc->byte_len,
-                                           ib_conn);
-               } else
-               if (wc->opcode == IB_WC_SEND) {
-                       tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
-                       iser_snd_completion(tx_desc, ib_conn);
-               } else {
-                       iser_err("Unknown wc opcode %d\n", wc->opcode);
-               }
-       } else {
-               if (wc->status != IB_WC_WR_FLUSH_ERR)
-                       iser_err("%s (%d): wr id %llx vend_err %x\n",
-                                ib_wc_status_msg(wc->status), wc->status,
-                                wc->wr_id, wc->vendor_err);
-               else
-                       iser_dbg("%s (%d): wr id %llx\n",
-                                ib_wc_status_msg(wc->status), wc->status,
-                                wc->wr_id);
-
-               if (wc->wr_id == ISER_BEACON_WRID)
-                       /* all flush errors were consumed */
-                       complete(&ib_conn->flush_comp);
-               else
-                       iser_handle_comp_error(ib_conn, wc);
-       }
-}
-
-/**
- * iser_cq_tasklet_fn - iSER completion polling loop
- * @data: iSER completion context
- *
- * Soft-IRQ context, polling connection CQ until
- * either CQ was empty or we exausted polling budget
- */
-static void iser_cq_tasklet_fn(unsigned long data)
-{
-       struct iser_comp *comp = (struct iser_comp *)data;
-       struct ib_cq *cq = comp->cq;
-       struct ib_wc *const wcs = comp->wcs;
-       int i, n, completed = 0;
-
-       while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
-               for (i = 0; i < n; i++)
-                       iser_handle_wc(&wcs[i]);
-
-               completed += n;
-               if (completed >= iser_cq_poll_limit)
-                       break;
-       }
-
-       /*
-        * It is assumed here that arming CQ only once its empty
-        * would not cause interrupts to be missed.
-        */
-       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-
-       iser_dbg("got %d completions\n", completed);
-}
-
-static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
-{
-       struct iser_comp *comp = cq_context;
-
-       tasklet_schedule(&comp->tasklet);
-}
-
 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
                             enum iser_data_dir cmd_dir, sector_t *sector)
 {
@@ -1319,3 +1160,21 @@ err:
        /* Not a lot we can do here, return ambiguous guard error */
        return 0x1;
 }
+
+void iser_err_comp(struct ib_wc *wc, const char *type)
+{
+       if (wc->status != IB_WC_WR_FLUSH_ERR) {
+               struct iser_conn *iser_conn = to_iser_conn(wc->qp->qp_context);
+
+               iser_err("%s failure: %s (%d) vend_err %x\n", type,
+                        ib_wc_status_msg(wc->status), wc->status,
+                        wc->vendor_err);
+
+               if (iser_conn->iscsi_conn)
+                       iscsi_conn_failure(iser_conn->iscsi_conn,
+                                          ISCSI_ERR_CONN_FAILED);
+       } else {
+               iser_dbg("%s failure: %s (%d)\n", type,
+                        ib_wc_status_msg(wc->status), wc->status);
+       }
+}
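A typical consumer, sketched from the iser_task_rsp hook the RX path installs above; its body lies outside this section, so treat the shape as an assumption rather than the committed code:

	void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct iser_rx_desc *desc =
			container_of(wc->wr_cqe, struct iser_rx_desc, cqe);

		if (unlikely(wc->status != IB_WC_SUCCESS)) {
			iser_err_comp(wc, "task_rsp");
			return;
		}
		/* normal RX processing continues here */
	}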
index 468c5e13256368c8b23b5ab25c903ead808f490f..f121e612933913dc9b24ccdbb41f63a5d8f09b84 100644 (file)
@@ -29,7 +29,6 @@
 #include <target/iscsi/iscsi_transport.h>
 #include <linux/semaphore.h>
 
-#include "isert_proto.h"
 #include "ib_isert.h"
 
 #define        ISERT_MAX_CONN          8
@@ -95,22 +94,6 @@ isert_qp_event_callback(struct ib_event *e, void *context)
        }
 }
 
-static int
-isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr)
-{
-       int ret;
-
-       ret = ib_query_device(ib_dev, devattr);
-       if (ret) {
-               isert_err("ib_query_device() failed: %d\n", ret);
-               return ret;
-       }
-       isert_dbg("devattr->max_sge: %d\n", devattr->max_sge);
-       isert_dbg("devattr->max_sge_rd: %d\n", devattr->max_sge_rd);
-
-       return 0;
-}
-
 static struct isert_comp *
 isert_comp_get(struct isert_conn *isert_conn)
 {
@@ -157,9 +140,9 @@ isert_create_qp(struct isert_conn *isert_conn,
        attr.recv_cq = comp->cq;
        attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS;
        attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
-       attr.cap.max_send_sge = device->dev_attr.max_sge;
-       isert_conn->max_sge = min(device->dev_attr.max_sge,
-                                 device->dev_attr.max_sge_rd);
+       attr.cap.max_send_sge = device->ib_device->attrs.max_sge;
+       isert_conn->max_sge = min(device->ib_device->attrs.max_sge,
+                                 device->ib_device->attrs.max_sge_rd);
        attr.cap.max_recv_sge = 1;
        attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        attr.qp_type = IB_QPT_RC;
@@ -287,8 +270,7 @@ isert_free_comps(struct isert_device *device)
 }
 
 static int
-isert_alloc_comps(struct isert_device *device,
-                 struct ib_device_attr *attr)
+isert_alloc_comps(struct isert_device *device)
 {
        int i, max_cqe, ret = 0;
 
@@ -308,7 +290,7 @@ isert_alloc_comps(struct isert_device *device,
                return -ENOMEM;
        }
 
-       max_cqe = min(ISER_MAX_CQ_LEN, attr->max_cqe);
+       max_cqe = min(ISER_MAX_CQ_LEN, device->ib_device->attrs.max_cqe);
 
        for (i = 0; i < device->comps_used; i++) {
                struct ib_cq_init_attr cq_attr = {};
@@ -344,17 +326,15 @@ out_cq:
 static int
 isert_create_device_ib_res(struct isert_device *device)
 {
-       struct ib_device_attr *dev_attr;
+       struct ib_device *ib_dev = device->ib_device;
        int ret;
 
-       dev_attr = &device->dev_attr;
-       ret = isert_query_device(device->ib_device, dev_attr);
-       if (ret)
-               goto out;
+       isert_dbg("ib_dev->attrs.max_sge: %d\n", ib_dev->attrs.max_sge);
+       isert_dbg("ib_dev->attrs.max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd);
 
        /* assign function handlers */
-       if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
-           dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
+       if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
+           ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
                device->use_fastreg = 1;
                device->reg_rdma_mem = isert_reg_rdma;
                device->unreg_rdma_mem = isert_unreg_rdma;
@@ -364,11 +344,11 @@ isert_create_device_ib_res(struct isert_device *device)
                device->unreg_rdma_mem = isert_unmap_cmd;
        }
 
-       ret = isert_alloc_comps(device, dev_attr);
+       ret = isert_alloc_comps(device);
        if (ret)
                goto out;
 
-       device->pd = ib_alloc_pd(device->ib_device);
+       device->pd = ib_alloc_pd(ib_dev);
        if (IS_ERR(device->pd)) {
                ret = PTR_ERR(device->pd);
                isert_err("failed to allocate pd, device %p, ret=%d\n",
@@ -377,7 +357,7 @@ isert_create_device_ib_res(struct isert_device *device)
        }
 
        /* Check signature cap */
-       device->pi_capable = dev_attr->device_cap_flags &
+       device->pi_capable = ib_dev->attrs.device_cap_flags &
                             IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
 
        return 0;
@@ -676,6 +656,32 @@ out_login_buf:
        return ret;
 }
 
+static void
+isert_set_nego_params(struct isert_conn *isert_conn,
+                     struct rdma_conn_param *param)
+{
+       struct ib_device_attr *attr = &isert_conn->device->ib_device->attrs;
+
+       /* Set max inflight RDMA READ requests */
+       isert_conn->initiator_depth = min_t(u8, param->initiator_depth,
+                               attr->max_qp_init_rd_atom);
+       isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth);
+
+       if (param->private_data) {
+               u8 flags = *(u8 *)param->private_data;
+
+               /*
+                * use remote invalidation if both the initiator
+                * and the HCA support it
+                */
+               isert_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP) &&
+                                         (attr->device_cap_flags &
+                                          IB_DEVICE_MEM_MGT_EXTENSIONS);
+               if (isert_conn->snd_w_inv)
+                       isert_info("Using remote invalidation\n");
+       }
+}
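A hypothetical one-liner distilling the rule encoded above: remote invalidation is enabled only when the peer does not veto it and the local HCA can actually generate it:

	static bool isert_use_remote_inv(u8 peer_flags, u64 dev_cap_flags)
	{
		return !(peer_flags & ISER_SEND_W_INV_NOT_SUP) &&
		       (dev_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS);
	}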
+
 static int
 isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
 {
@@ -714,11 +720,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
        }
        isert_conn->device = device;
 
-       /* Set max inflight RDMA READ requests */
-       isert_conn->initiator_depth = min_t(u8,
-                               event->param.conn.initiator_depth,
-                               device->dev_attr.max_qp_init_rd_atom);
-       isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth);
+       isert_set_nego_params(isert_conn, &event->param.conn);
 
        ret = isert_conn_setup_qp(isert_conn, cma_id);
        if (ret)
@@ -1050,8 +1052,8 @@ isert_create_send_desc(struct isert_conn *isert_conn,
        ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr,
                                   ISER_HEADERS_LEN, DMA_TO_DEVICE);
 
-       memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
-       tx_desc->iser_header.flags = ISER_VER;
+       memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl));
+       tx_desc->iser_header.flags = ISCSI_CTRL;
 
        tx_desc->num_sge = 1;
        tx_desc->isert_cmd = isert_cmd;
@@ -1097,7 +1099,14 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
 
        isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND;
        send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc;
-       send_wr->opcode = IB_WR_SEND;
+
+       if (isert_conn->snd_w_inv && isert_cmd->inv_rkey) {
+               send_wr->opcode  = IB_WR_SEND_WITH_INV;
+               send_wr->ex.invalidate_rkey = isert_cmd->inv_rkey;
+       } else {
+               send_wr->opcode = IB_WR_SEND;
+       }
+
        send_wr->sg_list = &tx_desc->tx_sg[0];
        send_wr->num_sge = isert_cmd->tx_desc.num_sge;
        send_wr->send_flags = IB_SEND_SIGNALED;
@@ -1486,6 +1495,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
                isert_cmd->read_va = read_va;
                isert_cmd->write_stag = write_stag;
                isert_cmd->write_va = write_va;
+               isert_cmd->inv_rkey = read_stag ? read_stag : write_stag;
 
                ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd,
                                        rx_desc, (unsigned char *)hdr);
@@ -1543,21 +1553,21 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
 static void
 isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn)
 {
-       struct iser_hdr *iser_hdr = &rx_desc->iser_header;
+       struct iser_ctrl *iser_ctrl = &rx_desc->iser_header;
        uint64_t read_va = 0, write_va = 0;
        uint32_t read_stag = 0, write_stag = 0;
 
-       switch (iser_hdr->flags & 0xF0) {
+       switch (iser_ctrl->flags & 0xF0) {
        case ISCSI_CTRL:
-               if (iser_hdr->flags & ISER_RSV) {
-                       read_stag = be32_to_cpu(iser_hdr->read_stag);
-                       read_va = be64_to_cpu(iser_hdr->read_va);
+               if (iser_ctrl->flags & ISER_RSV) {
+                       read_stag = be32_to_cpu(iser_ctrl->read_stag);
+                       read_va = be64_to_cpu(iser_ctrl->read_va);
                        isert_dbg("ISER_RSV: read_stag: 0x%x read_va: 0x%llx\n",
                                  read_stag, (unsigned long long)read_va);
                }
-               if (iser_hdr->flags & ISER_WSV) {
-                       write_stag = be32_to_cpu(iser_hdr->write_stag);
-                       write_va = be64_to_cpu(iser_hdr->write_va);
+               if (iser_ctrl->flags & ISER_WSV) {
+                       write_stag = be32_to_cpu(iser_ctrl->write_stag);
+                       write_va = be64_to_cpu(iser_ctrl->write_va);
                        isert_dbg("ISER_WSV: write_stag: 0x%x write_va: 0x%llx\n",
                                  write_stag, (unsigned long long)write_va);
                }
@@ -1568,7 +1578,7 @@ isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn)
                isert_err("iSER Hello message\n");
                break;
        default:
-               isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags);
+               isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_ctrl->flags);
                break;
        }
 
@@ -3095,12 +3105,20 @@ isert_rdma_accept(struct isert_conn *isert_conn)
        struct rdma_cm_id *cm_id = isert_conn->cm_id;
        struct rdma_conn_param cp;
        int ret;
+       struct iser_cm_hdr rsp_hdr;
 
        memset(&cp, 0, sizeof(struct rdma_conn_param));
        cp.initiator_depth = isert_conn->initiator_depth;
        cp.retry_count = 7;
        cp.rnr_retry_count = 7;
 
+       memset(&rsp_hdr, 0, sizeof(rsp_hdr));
+       rsp_hdr.flags = ISERT_ZBVA_NOT_USED;
+       if (!isert_conn->snd_w_inv)
+               rsp_hdr.flags = rsp_hdr.flags | ISERT_SEND_W_INV_NOT_USED;
+       cp.private_data = (void *)&rsp_hdr;
+       cp.private_data_len = sizeof(rsp_hdr);
+
        ret = rdma_accept(cm_id, &cp);
        if (ret) {
                isert_err("rdma_accept() failed with: %d\n", ret);
index 3d7fbc47c3434c32308136b7faec65590b74d439..8d50453eef66ce11d3cf321a535be0abe6b16d43 100644 (file)
@@ -3,6 +3,8 @@
 #include <linux/in6.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
+#include <scsi/iser.h>
+
 
 #define DRV_NAME       "isert"
 #define PFX            DRV_NAME ": "
 #define isert_err(fmt, arg...) \
        pr_err(PFX "%s: " fmt, __func__ , ## arg)
 
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN       (sizeof(struct iser_ctrl) + \
+                                sizeof(struct iscsi_hdr))
+#define ISER_RECV_DATA_SEG_LEN 8192
+#define ISER_RX_PAYLOAD_SIZE   (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE     (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
+
+#define ISERT_MAX_RX_MISC_PDUS 6 /*
+                                  * NOOP_OUT(2), TEXT(1),
+                                  * SCSI_TMFUNC(2), LOGOUT(1)
+                                  */
+
+#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */
+
+#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISERT_MIN_POSTED_RX    (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
+
+#define ISERT_INFLIGHT_DATAOUTS        8
+
+#define ISERT_QP_MAX_REQ_DTOS  (ISCSI_DEF_XMIT_CMDS_MAX *    \
+                               (1 + ISERT_INFLIGHT_DATAOUTS) + \
+                               ISERT_MAX_TX_MISC_PDUS  + \
+                               ISERT_MAX_RX_MISC_PDUS)
+
+#define ISER_RX_PAD_SIZE       (ISER_RECV_DATA_SEG_LEN + 4096 - \
+               (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge)))
+
 #define ISCSI_ISER_SG_TABLESIZE                256
 #define ISER_FASTREG_LI_WRID           0xffffffffffffffffULL
 #define ISER_BEACON_WRID               0xfffffffffffffffeULL
@@ -56,7 +90,7 @@ enum iser_conn_state {
 };
 
 struct iser_rx_desc {
-       struct iser_hdr iser_header;
+       struct iser_ctrl iser_header;
        struct iscsi_hdr iscsi_header;
        char            data[ISER_RECV_DATA_SEG_LEN];
        u64             dma_addr;
@@ -65,7 +99,7 @@ struct iser_rx_desc {
 } __packed;
 
 struct iser_tx_desc {
-       struct iser_hdr iser_header;
+       struct iser_ctrl iser_header;
        struct iscsi_hdr iscsi_header;
        enum isert_desc_type type;
        u64             dma_addr;
@@ -129,6 +163,7 @@ struct isert_cmd {
        uint32_t                write_stag;
        uint64_t                read_va;
        uint64_t                write_va;
+       uint32_t                inv_rkey;
        u64                     pdu_buf_dma;
        u32                     pdu_buf_len;
        struct isert_conn       *conn;
@@ -176,6 +211,7 @@ struct isert_conn {
        struct work_struct      release_work;
        struct ib_recv_wr       beacon;
        bool                    logout_posted;
+       bool                    snd_w_inv;
 };
 
 #define ISERT_MAX_CQ 64
@@ -207,7 +243,6 @@ struct isert_device {
        struct isert_comp       *comps;
        int                     comps_used;
        struct list_head        dev_node;
-       struct ib_device_attr   dev_attr;
        int                     (*reg_rdma_mem)(struct iscsi_conn *conn,
                                                    struct iscsi_cmd *cmd,
                                                    struct isert_rdma_wr *wr);
diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h
deleted file mode 100644 (file)
index 4dccd31..0000000
--- a/drivers/infiniband/ulp/isert/isert_proto.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* From iscsi_iser.h */
-
-struct iser_hdr {
-       u8      flags;
-       u8      rsvd[3];
-       __be32  write_stag; /* write rkey */
-       __be64  write_va;
-       __be32  read_stag;  /* read rkey */
-       __be64  read_va;
-} __packed;
-
-/*Constant PDU lengths calculations */
-#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
-
-#define ISER_RECV_DATA_SEG_LEN  8192
-#define ISER_RX_PAYLOAD_SIZE    (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
-#define ISER_RX_LOGIN_SIZE      (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
-
-/* QP settings */
-/* Maximal bounds on received asynchronous PDUs */
-#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
-
-#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1),         *
-                                  * SCSI_TMFUNC(2), LOGOUT(1) */
-
-#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */
-
-#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX)
-
-#define ISERT_MIN_POSTED_RX    (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
-
-#define ISERT_INFLIGHT_DATAOUTS        8
-
-#define ISERT_QP_MAX_REQ_DTOS  (ISCSI_DEF_XMIT_CMDS_MAX *    \
-                               (1 + ISERT_INFLIGHT_DATAOUTS) + \
-                               ISERT_MAX_TX_MISC_PDUS  + \
-                               ISERT_MAX_RX_MISC_PDUS)
-
-#define ISER_RX_PAD_SIZE       (ISER_RECV_DATA_SEG_LEN + 4096 - \
-               (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge)))
-
-#define ISER_VER       0x10
-#define ISER_WSV       0x08
-#define ISER_RSV       0x04
-#define ISCSI_CTRL     0x10
-#define ISER_HELLO     0x20
-#define ISER_HELLORPLY 0x30
index 3db9a659719b0f6283af610bf01b8f65d27292a9..03022f6420d770a469d818ee07fb7a4a4babda3b 100644 (file)
@@ -132,8 +132,9 @@ MODULE_PARM_DESC(ch_count,
 
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device, void *client_data);
-static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
-static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
+static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
+               const char *opname);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
 
 static struct scsi_transport_template *ib_srp_transport_template;
@@ -445,6 +446,17 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
                                  dev->max_pages_per_mr);
 }
 
+static void srp_drain_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct srp_rdma_ch *ch = cq->cq_context;
+
+       complete(&ch->done);
+}
+
+static struct ib_cqe srp_drain_cqe = {
+       .done           = srp_drain_done,
+};
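srp_drain_cqe underpins the drain handshake in srp_destroy_qp() below; the steps not visible in this hunk are inferred from the pre-existing function:

	/* 1. move the QP to IB_QPS_ERR so every posted WR flushes;
	 * 2. post one marker recv WR carrying &srp_drain_cqe;
	 * 3. wait_for_completion(&ch->done): srp_drain_done() fires when the
	 *    marker's flush completion arrives, meaning the queues are empty
	 *    and the QP can be torn down safely. */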
+
 /**
  * srp_destroy_qp() - destroy an RDMA queue pair
  * @ch: SRP RDMA channel.
@@ -457,10 +469,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
 static void srp_destroy_qp(struct srp_rdma_ch *ch)
 {
        static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
-       static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
+       static struct ib_recv_wr wr = { 0 };
        struct ib_recv_wr *bad_wr;
        int ret;
 
+       wr.wr_cqe = &srp_drain_cqe;
        /* Destroying a QP and reusing ch->done is only safe if not connected */
        WARN_ON_ONCE(ch->connected);
 
@@ -489,34 +502,27 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        struct ib_fmr_pool *fmr_pool = NULL;
        struct srp_fr_pool *fr_pool = NULL;
        const int m = dev->use_fast_reg ? 3 : 1;
-       struct ib_cq_init_attr cq_attr = {};
        int ret;
 
        init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
        if (!init_attr)
                return -ENOMEM;
 
-       /* + 1 for SRP_LAST_WR_ID */
-       cq_attr.cqe = target->queue_size + 1;
-       cq_attr.comp_vector = ch->comp_vector;
-       recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, ch,
-                              &cq_attr);
+       /* queue_size + 1 for ib_drain_qp */
+       recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1,
+                               ch->comp_vector, IB_POLL_SOFTIRQ);
        if (IS_ERR(recv_cq)) {
                ret = PTR_ERR(recv_cq);
                goto err;
        }
 
-       cq_attr.cqe = m * target->queue_size;
-       cq_attr.comp_vector = ch->comp_vector;
-       send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, ch,
-                              &cq_attr);
+       send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size,
+                               ch->comp_vector, IB_POLL_DIRECT);
        if (IS_ERR(send_cq)) {
                ret = PTR_ERR(send_cq);
                goto err_recv_cq;
        }
 
-       ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
-
        init_attr->event_handler       = srp_qp_event;
        init_attr->cap.max_send_wr     = m * target->queue_size;
        init_attr->cap.max_recv_wr     = target->queue_size + 1;
@@ -558,9 +564,9 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        if (ch->qp)
                srp_destroy_qp(ch);
        if (ch->recv_cq)
-               ib_destroy_cq(ch->recv_cq);
+               ib_free_cq(ch->recv_cq);
        if (ch->send_cq)
-               ib_destroy_cq(ch->send_cq);
+               ib_free_cq(ch->send_cq);
 
        ch->qp = qp;
        ch->recv_cq = recv_cq;
@@ -580,13 +586,13 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        return 0;
 
 err_qp:
-       ib_destroy_qp(qp);
+       srp_destroy_qp(ch);
 
 err_send_cq:
-       ib_destroy_cq(send_cq);
+       ib_free_cq(send_cq);
 
 err_recv_cq:
-       ib_destroy_cq(recv_cq);
+       ib_free_cq(recv_cq);
 
 err:
        kfree(init_attr);
@@ -622,9 +628,10 @@ static void srp_free_ch_ib(struct srp_target_port *target,
                if (ch->fmr_pool)
                        ib_destroy_fmr_pool(ch->fmr_pool);
        }
+
        srp_destroy_qp(ch);
-       ib_destroy_cq(ch->send_cq);
-       ib_destroy_cq(ch->recv_cq);
+       ib_free_cq(ch->send_cq);
+       ib_free_cq(ch->recv_cq);
 
        /*
         * Avoid that the SCSI error handler tries to use this channel after
@@ -1041,18 +1048,25 @@ out:
        return ret <= 0 ? ret : -ENODEV;
 }
 
-static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
+static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+       srp_handle_qp_err(cq, wc, "INV RKEY");
+}
+
+static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch,
+               u32 rkey)
 {
        struct ib_send_wr *bad_wr;
        struct ib_send_wr wr = {
                .opcode             = IB_WR_LOCAL_INV,
-               .wr_id              = LOCAL_INV_WR_ID_MASK,
                .next               = NULL,
                .num_sge            = 0,
                .send_flags         = 0,
                .ex.invalidate_rkey = rkey,
        };
 
+       wr.wr_cqe = &req->reg_cqe;
+       req->reg_cqe.done = srp_inv_rkey_err_done;
        return ib_post_send(ch->qp, &wr, &bad_wr);
 }
 
@@ -1074,7 +1088,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
                struct srp_fr_desc **pfr;
 
                for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) {
-                       res = srp_inv_rkey(ch, (*pfr)->mr->rkey);
+                       res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey);
                        if (res < 0) {
                                shost_printk(KERN_ERR, target->scsi_host, PFX
                                  "Queueing INV WR for rkey %#x failed (%d)\n",
@@ -1312,7 +1326,13 @@ reset_state:
        return 0;
 }
 
+static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+       srp_handle_qp_err(cq, wc, "FAST REG");
+}
+
 static int srp_map_finish_fr(struct srp_map_state *state,
+                            struct srp_request *req,
                             struct srp_rdma_ch *ch, int sg_nents)
 {
        struct srp_target_port *target = ch->target;
@@ -1349,9 +1369,11 @@ static int srp_map_finish_fr(struct srp_map_state *state,
        if (unlikely(n < 0))
                return n;
 
+       req->reg_cqe.done = srp_reg_mr_err_done;
+
        wr.wr.next = NULL;
        wr.wr.opcode = IB_WR_REG_MR;
-       wr.wr.wr_id = FAST_REG_WR_ID_MASK;
+       wr.wr.wr_cqe = &req->reg_cqe;
        wr.wr.num_sge = 0;
        wr.wr.send_flags = 0;
        wr.mr = desc->mr;
@@ -1455,7 +1477,7 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
        while (count) {
                int i, n;
 
-               n = srp_map_finish_fr(state, ch, count);
+               n = srp_map_finish_fr(state, req, ch, count);
                if (unlikely(n < 0))
                        return n;
 
@@ -1524,7 +1546,7 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
                idb_sg->dma_length = idb_sg->length;          /* hack^2 */
 #endif
-               ret = srp_map_finish_fr(&state, ch, 1);
+               ret = srp_map_finish_fr(&state, req, ch, 1);
                if (ret < 0)
                        return ret;
        } else if (dev->use_fmr) {
@@ -1719,7 +1741,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
        s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE;
        struct srp_iu *iu;
 
-       srp_send_completion(ch->send_cq, ch);
+       ib_process_cq_direct(ch->send_cq, -1);
 
        if (list_empty(&ch->free_tx))
                return NULL;
@@ -1739,6 +1761,19 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
        return iu;
 }
 
+static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
+       struct srp_rdma_ch *ch = cq->cq_context;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               srp_handle_qp_err(cq, wc, "SEND");
+               return;
+       }
+
+       list_add(&iu->list, &ch->free_tx);
+}
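Since the send CQ is now allocated with IB_POLL_DIRECT (see srp_create_ch_ib above), nothing reaps it asynchronously; the submission path polls it inline instead:

	/* from __srp_get_tx_iu() above: reap all available send completions,
	 * running srp_send_done() for each and refilling ch->free_tx */
	ib_process_cq_direct(ch->send_cq, -1);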
+
 static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
 {
        struct srp_target_port *target = ch->target;
@@ -1749,8 +1784,10 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
        list.length = len;
        list.lkey   = target->lkey;
 
+       iu->cqe.done = srp_send_done;
+
        wr.next       = NULL;
-       wr.wr_id      = (uintptr_t) iu;
+       wr.wr_cqe     = &iu->cqe;
        wr.sg_list    = &list;
        wr.num_sge    = 1;
        wr.opcode     = IB_WR_SEND;
@@ -1769,8 +1806,10 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
        list.length = iu->size;
        list.lkey   = target->lkey;
 
+       iu->cqe.done = srp_recv_done;
+
        wr.next     = NULL;
-       wr.wr_id    = (uintptr_t) iu;
+       wr.wr_cqe   = &iu->cqe;
        wr.sg_list  = &list;
        wr.num_sge  = 1;
 
@@ -1902,14 +1941,20 @@ static void srp_process_aer_req(struct srp_rdma_ch *ch,
                             "problems processing SRP_AER_REQ\n");
 }
 
-static void srp_handle_recv(struct srp_rdma_ch *ch, struct ib_wc *wc)
+static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+       struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe);
+       struct srp_rdma_ch *ch = cq->cq_context;
        struct srp_target_port *target = ch->target;
        struct ib_device *dev = target->srp_host->srp_dev->dev;
-       struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id;
        int res;
        u8 opcode;
 
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               srp_handle_qp_err(cq, wc, "RECV");
+               return;
+       }
+
        ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len,
                                   DMA_FROM_DEVICE);
 
@@ -1972,68 +2017,22 @@ static void srp_tl_err_work(struct work_struct *work)
                srp_start_tl_fail_timers(target->rport);
 }
 
-static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status,
-                             bool send_err, struct srp_rdma_ch *ch)
+static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
+               const char *opname)
 {
+       struct srp_rdma_ch *ch = cq->cq_context;
        struct srp_target_port *target = ch->target;
 
-       if (wr_id == SRP_LAST_WR_ID) {
-               complete(&ch->done);
-               return;
-       }
-
        if (ch->connected && !target->qp_in_error) {
-               if (wr_id & LOCAL_INV_WR_ID_MASK) {
-                       shost_printk(KERN_ERR, target->scsi_host, PFX
-                                    "LOCAL_INV failed with status %s (%d)\n",
-                                    ib_wc_status_msg(wc_status), wc_status);
-               } else if (wr_id & FAST_REG_WR_ID_MASK) {
-                       shost_printk(KERN_ERR, target->scsi_host, PFX
-                                    "FAST_REG_MR failed status %s (%d)\n",
-                                    ib_wc_status_msg(wc_status), wc_status);
-               } else {
-                       shost_printk(KERN_ERR, target->scsi_host,
-                                    PFX "failed %s status %s (%d) for iu %p\n",
-                                    send_err ? "send" : "receive",
-                                    ib_wc_status_msg(wc_status), wc_status,
-                                    (void *)(uintptr_t)wr_id);
-               }
+               shost_printk(KERN_ERR, target->scsi_host,
+                            PFX "failed %s status %s (%d) for CQE %p\n",
+                            opname, ib_wc_status_msg(wc->status), wc->status,
+                            wc->wr_cqe);
                queue_work(system_long_wq, &target->tl_err_work);
        }
        target->qp_in_error = true;
 }
 
-static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr)
-{
-       struct srp_rdma_ch *ch = ch_ptr;
-       struct ib_wc wc;
-
-       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-       while (ib_poll_cq(cq, 1, &wc) > 0) {
-               if (likely(wc.status == IB_WC_SUCCESS)) {
-                       srp_handle_recv(ch, &wc);
-               } else {
-                       srp_handle_qp_err(wc.wr_id, wc.status, false, ch);
-               }
-       }
-}
-
-static void srp_send_completion(struct ib_cq *cq, void *ch_ptr)
-{
-       struct srp_rdma_ch *ch = ch_ptr;
-       struct ib_wc wc;
-       struct srp_iu *iu;
-
-       while (ib_poll_cq(cq, 1, &wc) > 0) {
-               if (likely(wc.status == IB_WC_SUCCESS)) {
-                       iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
-                       list_add(&iu->list, &ch->free_tx);
-               } else {
-                       srp_handle_qp_err(wc.wr_id, wc.status, true, ch);
-               }
-       }
-}
-
 static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 {
        struct srp_target_port *target = host_to_target(shost);
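The deleted srp_recv_completion()/srp_send_completion() were the old hand-rolled completion handlers: re-arm the CQ with ib_req_notify_cq(), drain it with ib_poll_cq(), and demultiplex on wr_id. Under the new API that loop lives in the IB core; a ULP only chooses a polling context when allocating the CQ (the srp hunks that switch to ib_alloc_cq() are outside this excerpt). Sketch of the allocation, under the 4.5-era signature:

	/* the private pointer becomes cq->cq_context, as read by .done callbacks */
	cq = ib_alloc_cq(device, ch, nr_cqe, 0 /* comp_vector */,
			 IB_POLL_SOFTIRQ);
	if (IS_ERR(cq))
		return PTR_ERR(cq);

IB_POLL_SOFTIRQ polls from softirq context, so callbacks must not sleep; IB_POLL_WORKQUEUE (used by ib_srpt below) polls from a workqueue, so they may; IB_POLL_DIRECT leaves polling to the caller via ib_process_cq_direct().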
@@ -3439,27 +3438,17 @@ free_host:
 static void srp_add_one(struct ib_device *device)
 {
        struct srp_device *srp_dev;
-       struct ib_device_attr *dev_attr;
        struct srp_host *host;
        int mr_page_shift, p;
        u64 max_pages_per_mr;
 
-       dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
-       if (!dev_attr)
-               return;
-
-       if (ib_query_device(device, dev_attr)) {
-               pr_warn("Query device failed for %s\n", device->name);
-               goto free_attr;
-       }
-
        srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL);
        if (!srp_dev)
-               goto free_attr;
+               return;
 
        srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
                            device->map_phys_fmr && device->unmap_fmr);
-       srp_dev->has_fr = (dev_attr->device_cap_flags &
+       srp_dev->has_fr = (device->attrs.device_cap_flags &
                           IB_DEVICE_MEM_MGT_EXTENSIONS);
        if (!srp_dev->has_fmr && !srp_dev->has_fr)
                dev_warn(&device->dev, "neither FMR nor FR is supported\n");
@@ -3473,23 +3462,23 @@ static void srp_add_one(struct ib_device *device)
         * minimum of 4096 bytes. We're unlikely to build large sglists
         * out of smaller entries.
         */
-       mr_page_shift           = max(12, ffs(dev_attr->page_size_cap) - 1);
+       mr_page_shift           = max(12, ffs(device->attrs.page_size_cap) - 1);
        srp_dev->mr_page_size   = 1 << mr_page_shift;
        srp_dev->mr_page_mask   = ~((u64) srp_dev->mr_page_size - 1);
-       max_pages_per_mr        = dev_attr->max_mr_size;
+       max_pages_per_mr        = device->attrs.max_mr_size;
        do_div(max_pages_per_mr, srp_dev->mr_page_size);
        srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
                                          max_pages_per_mr);
        if (srp_dev->use_fast_reg) {
                srp_dev->max_pages_per_mr =
                        min_t(u32, srp_dev->max_pages_per_mr,
-                             dev_attr->max_fast_reg_page_list_len);
+                             device->attrs.max_fast_reg_page_list_len);
        }
        srp_dev->mr_max_size    = srp_dev->mr_page_size *
                                   srp_dev->max_pages_per_mr;
-       pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n",
-                device->name, mr_page_shift, dev_attr->max_mr_size,
-                dev_attr->max_fast_reg_page_list_len,
+       pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n",
+                device->name, mr_page_shift, device->attrs.max_mr_size,
+                device->attrs.max_fast_reg_page_list_len,
                 srp_dev->max_pages_per_mr, srp_dev->mr_max_size);
 
        INIT_LIST_HEAD(&srp_dev->dev_list);
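Since the core now fills in ib_device.attrs when the device is registered, ULPs read attributes directly, and the kmalloc()/ib_query_device()/kfree() dance above disappears together with its error path. A minimal sketch of the new-style access (example_add_one is a hypothetical client callback):

	static void example_add_one(struct ib_device *device)
	{
		/* attrs is populated by the core before ib_client.add() runs */
		pr_debug("%s: max_mr_size=%#llx max_srq_wr=%d\n",
			 device->name, device->attrs.max_mr_size,
			 device->attrs.max_srq_wr);
	}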
@@ -3517,17 +3506,13 @@ static void srp_add_one(struct ib_device *device)
        }
 
        ib_set_client_data(device, &srp_client, srp_dev);
-
-       goto free_attr;
+       return;
 
 err_pd:
        ib_dealloc_pd(srp_dev->pd);
 
 free_dev:
        kfree(srp_dev);
-
-free_attr:
-       kfree(dev_attr);
 }
 
 static void srp_remove_one(struct ib_device *device, void *client_data)
@@ -3587,8 +3572,6 @@ static int __init srp_init_module(void)
 {
        int ret;
 
-       BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *));
-
        if (srp_sg_tablesize) {
                pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
                if (!cmd_sg_entries)
index f6af531f9f32c990cd30c7139d247c643144e55d..9e05ce4a04fd08b0a450f5a546b7fb51901c5afe 100644 (file)
@@ -66,11 +66,6 @@ enum {
        SRP_TAG_TSK_MGMT        = 1U << 31,
 
        SRP_MAX_PAGES_PER_MR    = 512,
-
-       LOCAL_INV_WR_ID_MASK    = 1,
-       FAST_REG_WR_ID_MASK     = 2,
-
-       SRP_LAST_WR_ID          = 0xfffffffcU,
 };
 
 enum srp_target_state {
@@ -128,6 +123,7 @@ struct srp_request {
        struct srp_direct_buf  *indirect_desc;
        dma_addr_t              indirect_dma_addr;
        short                   nmdesc;
+       struct ib_cqe           reg_cqe;
 };
 
 /**
@@ -231,6 +227,7 @@ struct srp_iu {
        void                   *buf;
        size_t                  size;
        enum dma_data_direction direction;
+       struct ib_cqe           cqe;
 };
 
 /**
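Note that srp_iu grows a single embedded ib_cqe even though the same IU serves both receives and sends: the driver re-points cqe.done at the matching handler before each post (srp_recv_done() above; the send-side handler sits in a hunk outside this excerpt). Sketch:

	iu->cqe.done = srp_recv_done;	/* before ib_post_recv() */
	...
	iu->cqe.done = srp_send_done;	/* before ib_post_send(), elsewhere */

The new reg_cqe in srp_request plays the same role for the memory-registration and local-invalidate work requests whose LOCAL_INV/FAST_REG wr_id masks are deleted above.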
index bc5470c43d26080c5fa0f9d0e6427e848ae2771a..0c37fee363b1d60eb3dd63ebfdc6330b2e14b562 100644 (file)
@@ -93,6 +93,8 @@ MODULE_PARM_DESC(srpt_service_guid,
 static struct ib_client srpt_client;
 static void srpt_release_channel(struct srpt_rdma_ch *ch);
 static int srpt_queue_status(struct se_cmd *cmd);
+static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc);
 
 /**
  * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE.
@@ -341,10 +343,10 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot,
        memset(iocp, 0, sizeof *iocp);
        strcpy(iocp->id_string, SRPT_ID_STRING);
        iocp->guid = cpu_to_be64(srpt_service_guid);
-       iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
-       iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id);
-       iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver);
-       iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
+       iocp->vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id);
+       iocp->device_id = cpu_to_be32(sdev->device->attrs.vendor_part_id);
+       iocp->device_version = cpu_to_be16(sdev->device->attrs.hw_ver);
+       iocp->subsys_vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id);
        iocp->subsys_device_id = 0x0;
        iocp->io_class = cpu_to_be16(SRP_REV16A_IB_IO_CLASS);
        iocp->io_subclass = cpu_to_be16(SRP_IO_SUBCLASS);
@@ -453,6 +455,7 @@ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent,
  * srpt_mad_recv_handler() - MAD reception callback function.
  */
 static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent,
+                                 struct ib_mad_send_buf *send_buf,
                                  struct ib_mad_recv_wc *mad_wc)
 {
        struct srpt_port *sport = (struct srpt_port *)mad_agent->context;
@@ -778,12 +781,12 @@ static int srpt_post_recv(struct srpt_device *sdev,
        struct ib_recv_wr wr, *bad_wr;
 
        BUG_ON(!sdev);
-       wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index);
-
        list.addr = ioctx->ioctx.dma;
        list.length = srp_max_req_size;
        list.lkey = sdev->pd->local_dma_lkey;
 
+       ioctx->ioctx.cqe.done = srpt_recv_done;
+       wr.wr_cqe = &ioctx->ioctx.cqe;
        wr.next = NULL;
        wr.sg_list = &list;
        wr.num_sge = 1;
@@ -819,8 +822,9 @@ static int srpt_post_send(struct srpt_rdma_ch *ch,
        list.length = len;
        list.lkey = sdev->pd->local_dma_lkey;
 
+       ioctx->ioctx.cqe.done = srpt_send_done;
        wr.next = NULL;
-       wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
+       wr.wr_cqe = &ioctx->ioctx.cqe;
        wr.sg_list = &list;
        wr.num_sge = 1;
        wr.opcode = IB_WR_SEND;
@@ -1052,13 +1056,13 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 
        BUG_ON(!ch);
        BUG_ON(!ioctx);
-       BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius);
+       BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs);
 
        while (ioctx->n_rdma)
-               kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge);
+               kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list);
 
-       kfree(ioctx->rdma_ius);
-       ioctx->rdma_ius = NULL;
+       kfree(ioctx->rdma_wrs);
+       ioctx->rdma_wrs = NULL;
 
        if (ioctx->mapped_sg_count) {
                sg = ioctx->sg;
@@ -1082,7 +1086,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
        struct scatterlist *sg, *sg_orig;
        int sg_cnt;
        enum dma_data_direction dir;
-       struct rdma_iu *riu;
+       struct ib_rdma_wr *riu;
        struct srp_direct_buf *db;
        dma_addr_t dma_addr;
        struct ib_sge *sge;
@@ -1109,23 +1113,24 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 
        ioctx->mapped_sg_count = count;
 
-       if (ioctx->rdma_ius && ioctx->n_rdma_ius)
-               nrdma = ioctx->n_rdma_ius;
+       if (ioctx->rdma_wrs && ioctx->n_rdma_wrs)
+               nrdma = ioctx->n_rdma_wrs;
        else {
                nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
                        + ioctx->n_rbuf;
 
-               ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL);
-               if (!ioctx->rdma_ius)
+               ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs),
+                               GFP_KERNEL);
+               if (!ioctx->rdma_wrs)
                        goto free_mem;
 
-               ioctx->n_rdma_ius = nrdma;
+               ioctx->n_rdma_wrs = nrdma;
        }
 
        db = ioctx->rbufs;
        tsize = cmd->data_length;
        dma_len = ib_sg_dma_len(dev, &sg[0]);
-       riu = ioctx->rdma_ius;
+       riu = ioctx->rdma_wrs;
 
        /*
         * For each remote desc - calculate the #ib_sge.
@@ -1139,9 +1144,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
             j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
                rsize = be32_to_cpu(db->len);
                raddr = be64_to_cpu(db->va);
-               riu->raddr = raddr;
+               riu->remote_addr = raddr;
                riu->rkey = be32_to_cpu(db->key);
-               riu->sge_cnt = 0;
+               riu->wr.num_sge = 0;
 
                /* calculate how many sge required for this remote_buf */
                while (rsize > 0 && tsize > 0) {
@@ -1165,33 +1170,35 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
                                rsize = 0;
                        }
 
-                       ++riu->sge_cnt;
+                       ++riu->wr.num_sge;
 
-                       if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) {
+                       if (rsize > 0 &&
+                           riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) {
                                ++ioctx->n_rdma;
-                               riu->sge =
-                                   kmalloc(riu->sge_cnt * sizeof *riu->sge,
-                                           GFP_KERNEL);
-                               if (!riu->sge)
+                               riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+                                               sizeof(*riu->wr.sg_list),
+                                               GFP_KERNEL);
+                               if (!riu->wr.sg_list)
                                        goto free_mem;
 
                                ++riu;
-                               riu->sge_cnt = 0;
-                               riu->raddr = raddr;
+                               riu->wr.num_sge = 0;
+                               riu->remote_addr = raddr;
                                riu->rkey = be32_to_cpu(db->key);
                        }
                }
 
                ++ioctx->n_rdma;
-               riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge,
-                                  GFP_KERNEL);
-               if (!riu->sge)
+               riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+                                       sizeof(*riu->wr.sg_list),
+                                       GFP_KERNEL);
+               if (!riu->wr.sg_list)
                        goto free_mem;
        }
 
        db = ioctx->rbufs;
        tsize = cmd->data_length;
-       riu = ioctx->rdma_ius;
+       riu = ioctx->rdma_wrs;
        sg = sg_orig;
        dma_len = ib_sg_dma_len(dev, &sg[0]);
        dma_addr = ib_sg_dma_address(dev, &sg[0]);
@@ -1200,7 +1207,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
        for (i = 0, j = 0;
             j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
                rsize = be32_to_cpu(db->len);
-               sge = riu->sge;
+               sge = riu->wr.sg_list;
                k = 0;
 
                while (rsize > 0 && tsize > 0) {
@@ -1232,9 +1239,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
                        }
 
                        ++k;
-                       if (k == riu->sge_cnt && rsize > 0 && tsize > 0) {
+                       if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) {
                                ++riu;
-                               sge = riu->sge;
+                               sge = riu->wr.sg_list;
                                k = 0;
                        } else if (rsize > 0 && tsize > 0)
                                ++sge;
@@ -1277,8 +1284,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
        ioctx->n_rbuf = 0;
        ioctx->rbufs = NULL;
        ioctx->n_rdma = 0;
-       ioctx->n_rdma_ius = 0;
-       ioctx->rdma_ius = NULL;
+       ioctx->n_rdma_wrs = 0;
+       ioctx->rdma_wrs = NULL;
        ioctx->mapped_sg_count = 0;
        init_completion(&ioctx->tx_done);
        ioctx->queue_status_only = false;
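The driver-private struct rdma_iu (deleted from ib_srpt.h further down) duplicated fields the core already defines; storing struct ib_rdma_wr directly means the array can later be handed to ib_post_send() without re-marshalling. For reference, the shape of the core type in this kernel (paraphrased from include/rdma/ib_verbs.h):

	struct ib_rdma_wr {
		struct ib_send_wr	wr;	/* carries next/wr_cqe/sg_list/num_sge/opcode */
		u64			remote_addr;
		u32			rkey;
	};

which is why riu->raddr becomes riu->remote_addr and riu->sge_cnt becomes riu->wr.num_sge throughout this function.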
@@ -1380,118 +1387,44 @@ out:
 }
 
 /**
- * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion.
- */
-static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id)
-{
-       struct srpt_send_ioctx *ioctx;
-       enum srpt_command_state state;
-       u32 index;
-
-       atomic_inc(&ch->sq_wr_avail);
-
-       index = idx_from_wr_id(wr_id);
-       ioctx = ch->ioctx_ring[index];
-       state = srpt_get_cmd_state(ioctx);
-
-       WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
-               && state != SRPT_STATE_MGMT_RSP_SENT
-               && state != SRPT_STATE_NEED_DATA
-               && state != SRPT_STATE_DONE);
-
-       /* If SRP_RSP sending failed, undo the ch->req_lim change. */
-       if (state == SRPT_STATE_CMD_RSP_SENT
-           || state == SRPT_STATE_MGMT_RSP_SENT)
-               atomic_dec(&ch->req_lim);
-
-       srpt_abort_cmd(ioctx);
-}
-
-/**
- * srpt_handle_send_comp() - Process an IB send completion notification.
- */
-static void srpt_handle_send_comp(struct srpt_rdma_ch *ch,
-                                 struct srpt_send_ioctx *ioctx)
-{
-       enum srpt_command_state state;
-
-       atomic_inc(&ch->sq_wr_avail);
-
-       state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
-
-       if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
-                   && state != SRPT_STATE_MGMT_RSP_SENT
-                   && state != SRPT_STATE_DONE))
-               pr_debug("state = %d\n", state);
-
-       if (state != SRPT_STATE_DONE) {
-               srpt_unmap_sg_to_ib_sge(ch, ioctx);
-               transport_generic_free_cmd(&ioctx->cmd, 0);
-       } else {
-               pr_err("IB completion has been received too late for"
-                      " wr_id = %u.\n", ioctx->ioctx.index);
-       }
-}
-
-/**
- * srpt_handle_rdma_comp() - Process an IB RDMA completion notification.
- *
  * XXX: the code that is now target_execute_cmd() used to be asynchronous, so
  * unmapping the data that was transferred via IB RDMA had to be postponed
  * until the check_stop_free() callback.  None of this is necessary anymore
  * and needs to be cleaned up.
  */
-static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch,
-                                 struct srpt_send_ioctx *ioctx,
-                                 enum srpt_opcode opcode)
+static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+       struct srpt_rdma_ch *ch = cq->cq_context;
+       struct srpt_send_ioctx *ioctx =
+               container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
+
        WARN_ON(ioctx->n_rdma <= 0);
        atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
 
-       if (opcode == SRPT_RDMA_READ_LAST) {
-               if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
-                                               SRPT_STATE_DATA_IN))
-                       target_execute_cmd(&ioctx->cmd);
-               else
-                       pr_err("%s[%d]: wrong state = %d\n", __func__,
-                              __LINE__, srpt_get_cmd_state(ioctx));
-       } else if (opcode == SRPT_RDMA_ABORT) {
-               ioctx->rdma_aborted = true;
-       } else {
-               WARN(true, "unexpected opcode %d\n", opcode);
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n",
+                       ioctx, wc->status);
+               srpt_abort_cmd(ioctx);
+               return;
        }
+
+       if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
+                                       SRPT_STATE_DATA_IN))
+               target_execute_cmd(&ioctx->cmd);
+       else
+               pr_err("%s[%d]: wrong state = %d\n", __func__,
+                      __LINE__, srpt_get_cmd_state(ioctx));
 }
 
-/**
- * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion.
- */
-static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch,
-                                     struct srpt_send_ioctx *ioctx,
-                                     enum srpt_opcode opcode)
+static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       enum srpt_command_state state;
+       struct srpt_send_ioctx *ioctx =
+               container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
 
-       state = srpt_get_cmd_state(ioctx);
-       switch (opcode) {
-       case SRPT_RDMA_READ_LAST:
-               if (ioctx->n_rdma <= 0) {
-                       pr_err("Received invalid RDMA read"
-                              " error completion with idx %d\n",
-                              ioctx->ioctx.index);
-                       break;
-               }
-               atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
-               if (state == SRPT_STATE_NEED_DATA)
-                       srpt_abort_cmd(ioctx);
-               else
-                       pr_err("%s[%d]: wrong state = %d\n",
-                              __func__, __LINE__, state);
-               break;
-       case SRPT_RDMA_WRITE_LAST:
-               break;
-       default:
-               pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode);
-               break;
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n",
+                       ioctx, wc->status);
+               srpt_abort_cmd(ioctx);
        }
 }
 
@@ -1926,32 +1859,26 @@ out:
        return;
 }
 
-static void srpt_process_rcv_completion(struct ib_cq *cq,
-                                       struct srpt_rdma_ch *ch,
-                                       struct ib_wc *wc)
+static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct srpt_device *sdev = ch->sport->sdev;
-       struct srpt_recv_ioctx *ioctx;
-       u32 index;
+       struct srpt_rdma_ch *ch = cq->cq_context;
+       struct srpt_recv_ioctx *ioctx =
+               container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe);
 
-       index = idx_from_wr_id(wc->wr_id);
        if (wc->status == IB_WC_SUCCESS) {
                int req_lim;
 
                req_lim = atomic_dec_return(&ch->req_lim);
                if (unlikely(req_lim < 0))
                        pr_err("req_lim = %d < 0\n", req_lim);
-               ioctx = sdev->ioctx_ring[index];
                srpt_handle_new_iu(ch, ioctx, NULL);
        } else {
-               pr_info("receiving failed for idx %u with status %d\n",
-                       index, wc->status);
+               pr_info("receiving failed for ioctx %p with status %d\n",
+                       ioctx, wc->status);
        }
 }
 
 /**
- * srpt_process_send_completion() - Process an IB send completion.
- *
  * Note: Although this has not yet been observed during tests, at least in
  * theory it is possible that the srpt_get_send_ioctx() call invoked by
  * srpt_handle_new_iu() fails. This is possible because the req_lim_delta
@@ -1964,108 +1891,51 @@ static void srpt_process_rcv_completion(struct ib_cq *cq,
  * are queued on cmd_wait_list. The code below processes these delayed
  * requests one at a time.
  */
-static void srpt_process_send_completion(struct ib_cq *cq,
-                                        struct srpt_rdma_ch *ch,
-                                        struct ib_wc *wc)
+static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-       struct srpt_send_ioctx *send_ioctx;
-       uint32_t index;
-       enum srpt_opcode opcode;
+       struct srpt_rdma_ch *ch = cq->cq_context;
+       struct srpt_send_ioctx *ioctx =
+               container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
+       enum srpt_command_state state;
 
-       index = idx_from_wr_id(wc->wr_id);
-       opcode = opcode_from_wr_id(wc->wr_id);
-       send_ioctx = ch->ioctx_ring[index];
-       if (wc->status == IB_WC_SUCCESS) {
-               if (opcode == SRPT_SEND)
-                       srpt_handle_send_comp(ch, send_ioctx);
-               else {
-                       WARN_ON(opcode != SRPT_RDMA_ABORT &&
-                               wc->opcode != IB_WC_RDMA_READ);
-                       srpt_handle_rdma_comp(ch, send_ioctx, opcode);
-               }
+       state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+
+       WARN_ON(state != SRPT_STATE_CMD_RSP_SENT &&
+               state != SRPT_STATE_MGMT_RSP_SENT);
+
+       atomic_inc(&ch->sq_wr_avail);
+
+       if (wc->status != IB_WC_SUCCESS) {
+               pr_info("sending response for ioctx 0x%p failed"
+                       " with status %d\n", ioctx, wc->status);
+
+               atomic_dec(&ch->req_lim);
+               srpt_abort_cmd(ioctx);
+               goto out;
+       }
+
+       if (state != SRPT_STATE_DONE) {
+               srpt_unmap_sg_to_ib_sge(ch, ioctx);
+               transport_generic_free_cmd(&ioctx->cmd, 0);
        } else {
-               if (opcode == SRPT_SEND) {
-                       pr_info("sending response for idx %u failed"
-                               " with status %d\n", index, wc->status);
-                       srpt_handle_send_err_comp(ch, wc->wr_id);
-               } else if (opcode != SRPT_RDMA_MID) {
-                       pr_info("RDMA t %d for idx %u failed with"
-                               " status %d\n", opcode, index, wc->status);
-                       srpt_handle_rdma_err_comp(ch, send_ioctx, opcode);
-               }
+               pr_err("IB completion has been received too late for"
+                      " wr_id = %u.\n", ioctx->ioctx.index);
        }
 
-       while (unlikely(opcode == SRPT_SEND
-                       && !list_empty(&ch->cmd_wait_list)
-                       && srpt_get_ch_state(ch) == CH_LIVE
-                       && (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) {
+out:
+       while (!list_empty(&ch->cmd_wait_list) &&
+              srpt_get_ch_state(ch) == CH_LIVE &&
+              (ioctx = srpt_get_send_ioctx(ch)) != NULL) {
                struct srpt_recv_ioctx *recv_ioctx;
 
                recv_ioctx = list_first_entry(&ch->cmd_wait_list,
                                              struct srpt_recv_ioctx,
                                              wait_list);
                list_del(&recv_ioctx->wait_list);
-               srpt_handle_new_iu(ch, recv_ioctx, send_ioctx);
-       }
-}
-
-static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch)
-{
-       struct ib_wc *const wc = ch->wc;
-       int i, n;
-
-       WARN_ON(cq != ch->cq);
-
-       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-       while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) {
-               for (i = 0; i < n; i++) {
-                       if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV)
-                               srpt_process_rcv_completion(cq, ch, &wc[i]);
-                       else
-                               srpt_process_send_completion(cq, ch, &wc[i]);
-               }
+               srpt_handle_new_iu(ch, recv_ioctx, ioctx);
        }
 }
 
-/**
- * srpt_completion() - IB completion queue callback function.
- *
- * Notes:
- * - It is guaranteed that a completion handler will never be invoked
- *   concurrently on two different CPUs for the same completion queue. See also
- *   Documentation/infiniband/core_locking.txt and the implementation of
- *   handle_edge_irq() in kernel/irq/chip.c.
- * - When threaded IRQs are enabled, completion handlers are invoked in thread
- *   context instead of interrupt context.
- */
-static void srpt_completion(struct ib_cq *cq, void *ctx)
-{
-       struct srpt_rdma_ch *ch = ctx;
-
-       wake_up_interruptible(&ch->wait_queue);
-}
-
-static int srpt_compl_thread(void *arg)
-{
-       struct srpt_rdma_ch *ch;
-
-       /* Hibernation / freezing of the SRPT kernel thread is not supported. */
-       current->flags |= PF_NOFREEZE;
-
-       ch = arg;
-       BUG_ON(!ch);
-       pr_info("Session %s: kernel thread %s (PID %d) started\n",
-               ch->sess_name, ch->thread->comm, current->pid);
-       while (!kthread_should_stop()) {
-               wait_event_interruptible(ch->wait_queue,
-                       (srpt_process_completion(ch->cq, ch),
-                        kthread_should_stop()));
-       }
-       pr_info("Session %s: kernel thread %s (PID %d) stopped\n",
-               ch->sess_name, ch->thread->comm, current->pid);
-       return 0;
-}
-
 /**
  * srpt_create_ch_ib() - Create receive and send completion queues.
  */
@@ -2075,7 +1945,6 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
        struct srpt_port *sport = ch->sport;
        struct srpt_device *sdev = sport->sdev;
        u32 srp_sq_size = sport->port_attrib.srp_sq_size;
-       struct ib_cq_init_attr cq_attr = {};
        int ret;
 
        WARN_ON(ch->rq_size < 1);
@@ -2086,9 +1955,8 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
                goto out;
 
 retry:
-       cq_attr.cqe = ch->rq_size + srp_sq_size;
-       ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch,
-                             &cq_attr);
+       ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + srp_sq_size,
+                       0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE);
        if (IS_ERR(ch->cq)) {
                ret = PTR_ERR(ch->cq);
                pr_err("failed to create CQ cqe= %d ret= %d\n",
@@ -2131,18 +1999,6 @@ retry:
        if (ret)
                goto err_destroy_qp;
 
-       init_waitqueue_head(&ch->wait_queue);
-
-       pr_debug("creating thread for session %s\n", ch->sess_name);
-
-       ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl");
-       if (IS_ERR(ch->thread)) {
-               pr_err("failed to create kernel thread %ld\n",
-                      PTR_ERR(ch->thread));
-               ch->thread = NULL;
-               goto err_destroy_qp;
-       }
-
 out:
        kfree(qp_init);
        return ret;
@@ -2150,17 +2006,14 @@ out:
 err_destroy_qp:
        ib_destroy_qp(ch->qp);
 err_destroy_cq:
-       ib_destroy_cq(ch->cq);
+       ib_free_cq(ch->cq);
        goto out;
 }
 
 static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch)
 {
-       if (ch->thread)
-               kthread_stop(ch->thread);
-
        ib_destroy_qp(ch->qp);
-       ib_destroy_cq(ch->cq);
+       ib_free_cq(ch->cq);
 }
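srpt allocates its CQ with IB_POLL_WORKQUEUE, so completion handlers now run in a context that may sleep; that is what lets srpt_send_done() and friends call straight into the target core and makes the private wait-queue/kthread machinery above removable. The teardown order here follows the usual verbs rule, sketched as comments (an assumption about semantics, not patch text):

	ib_destroy_qp(ch->qp);	/* channel stops generating new completions */
	ib_free_cq(ch->cq);	/* core stops its polling work, then destroys the CQ */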
 
 /**
@@ -2808,12 +2661,8 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
                              struct srpt_send_ioctx *ioctx)
 {
-       struct ib_rdma_wr wr;
        struct ib_send_wr *bad_wr;
-       struct rdma_iu *riu;
-       int i;
-       int ret;
-       int sq_wr_avail;
+       int sq_wr_avail, ret, i;
        enum dma_data_direction dir;
        const int n_rdma = ioctx->n_rdma;
 
@@ -2829,59 +2678,32 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
                }
        }
 
-       ioctx->rdma_aborted = false;
-       ret = 0;
-       riu = ioctx->rdma_ius;
-       memset(&wr, 0, sizeof wr);
-
-       for (i = 0; i < n_rdma; ++i, ++riu) {
-               if (dir == DMA_FROM_DEVICE) {
-                       wr.wr.opcode = IB_WR_RDMA_WRITE;
-                       wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
-                                               SRPT_RDMA_WRITE_LAST :
-                                               SRPT_RDMA_MID,
-                                               ioctx->ioctx.index);
-               } else {
-                       wr.wr.opcode = IB_WR_RDMA_READ;
-                       wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
-                                               SRPT_RDMA_READ_LAST :
-                                               SRPT_RDMA_MID,
-                                               ioctx->ioctx.index);
-               }
-               wr.wr.next = NULL;
-               wr.remote_addr = riu->raddr;
-               wr.rkey = riu->rkey;
-               wr.wr.num_sge = riu->sge_cnt;
-               wr.wr.sg_list = riu->sge;
+       for (i = 0; i < n_rdma; i++) {
+               struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr;
 
-               /* only get completion event for the last rdma write */
-               if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE)
-                       wr.wr.send_flags = IB_SEND_SIGNALED;
+               wr->opcode = (dir == DMA_FROM_DEVICE) ?
+                               IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 
-               ret = ib_post_send(ch->qp, &wr.wr, &bad_wr);
-               if (ret)
-                       break;
+               if (i == n_rdma - 1) {
+                       /* only get completion event for the last rdma read */
+                       if (dir == DMA_TO_DEVICE) {
+                               wr->send_flags = IB_SEND_SIGNALED;
+                               ioctx->rdma_cqe.done = srpt_rdma_read_done;
+                       } else {
+                               ioctx->rdma_cqe.done = srpt_rdma_write_done;
+                       }
+                       wr->wr_cqe = &ioctx->rdma_cqe;
+                       wr->next = NULL;
+               } else {
+                       wr->wr_cqe = NULL;
+                       wr->next = &ioctx->rdma_wrs[i + 1].wr;
+               }
        }
 
+       ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr);
        if (ret)
                pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
                                 __func__, __LINE__, ret, i, n_rdma);
-       if (ret && i > 0) {
-               wr.wr.num_sge = 0;
-               wr.wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index);
-               wr.wr.send_flags = IB_SEND_SIGNALED;
-               while (ch->state == CH_LIVE &&
-                       ib_post_send(ch->qp, &wr.wr, &bad_wr) != 0) {
-                       pr_info("Trying to abort failed RDMA transfer [%d]\n",
-                               ioctx->ioctx.index);
-                       msleep(1000);
-               }
-               while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) {
-                       pr_info("Waiting until RDMA abort finished [%d]\n",
-                               ioctx->ioctx.index);
-                       msleep(1000);
-               }
-       }
 out:
        if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
                atomic_add(n_rdma, &ch->sq_wr_avail);
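Instead of posting each RDMA work request separately (n_rdma round trips through ib_post_send()), the WRs are now linked via wr->next and posted once; only the final WR carries a wr_cqe, and only the read direction sets IB_SEND_SIGNALED, so a multi-WR transfer produces at most one success completion. Generic sketch of the chaining (wrs, cqe, qp and bad_wr are placeholders):

	for (i = 0; i < n; i++) {
		wrs[i].wr.next   = (i == n - 1) ? NULL : &wrs[i + 1].wr;
		wrs[i].wr.wr_cqe = NULL;	/* middle WRs: no CQE on success */
	}
	wrs[n - 1].wr.wr_cqe = &cqe;		/* one completion for the whole chain */
	ret = ib_post_send(qp, &wrs[0].wr, &bad_wr);

The SRPT_RDMA_ABORT dance for partially posted sequences also disappears: error handling is centralized in the completion callbacks and channel teardown instead of per-WR bookkeeping.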
@@ -3190,14 +3012,11 @@ static void srpt_add_one(struct ib_device *device)
        init_waitqueue_head(&sdev->ch_releaseQ);
        spin_lock_init(&sdev->spinlock);
 
-       if (ib_query_device(device, &sdev->dev_attr))
-               goto free_dev;
-
        sdev->pd = ib_alloc_pd(device);
        if (IS_ERR(sdev->pd))
                goto free_dev;
 
-       sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr);
+       sdev->srq_size = min(srpt_srq_size, sdev->device->attrs.max_srq_wr);
 
        srq_attr.event_handler = srpt_srq_event;
        srq_attr.srq_context = (void *)sdev;
@@ -3211,7 +3030,7 @@ static void srpt_add_one(struct ib_device *device)
                goto err_pd;
 
        pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n",
-                __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr,
+                __func__, sdev->srq_size, sdev->device->attrs.max_srq_wr,
                 device->name);
 
        if (!srpt_service_guid)
index 5366e0a9fd6d42d298cf296ed70dd70c618e8de0..09037f2b0b51430d568e847e7ebd5f24e7fd906e 100644 (file)
@@ -128,36 +128,6 @@ enum {
        DEFAULT_MAX_RDMA_SIZE = 65536,
 };
 
-enum srpt_opcode {
-       SRPT_RECV,
-       SRPT_SEND,
-       SRPT_RDMA_MID,
-       SRPT_RDMA_ABORT,
-       SRPT_RDMA_READ_LAST,
-       SRPT_RDMA_WRITE_LAST,
-};
-
-static inline u64 encode_wr_id(u8 opcode, u32 idx)
-{
-       return ((u64)opcode << 32) | idx;
-}
-static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id)
-{
-       return wr_id >> 32;
-}
-static inline u32 idx_from_wr_id(u64 wr_id)
-{
-       return (u32)wr_id;
-}
-
-struct rdma_iu {
-       u64             raddr;
-       u32             rkey;
-       struct ib_sge   *sge;
-       u32             sge_cnt;
-       int             mem_id;
-};
-
 /**
  * enum srpt_command_state - SCSI command state managed by SRPT.
  * @SRPT_STATE_NEW:           New command arrived and is being processed.
@@ -189,6 +159,7 @@ enum srpt_command_state {
  * @index: Index of the I/O context in its ioctx_ring array.
  */
 struct srpt_ioctx {
+       struct ib_cqe           cqe;
        void                    *buf;
        dma_addr_t              dma;
        uint32_t                index;
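Embedding the cqe in the common struct srpt_ioctx lets both ioctx flavours share the posting code while each handler recovers its own outer type; container_of() composes through the nested member, exactly as srpt_recv_done() and srpt_send_done() do above:

	struct srpt_recv_ioctx *r =
		container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe);
	struct srpt_send_ioctx *s =
		container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);

Which cast is valid is decided by which .done callback fires: srpt_recv_done() only ever sees CQEs from recv ioctxs, srpt_send_done() only those from send ioctxs.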
@@ -215,32 +186,30 @@ struct srpt_recv_ioctx {
  * @sg:          Pointer to sg-list associated with this I/O context.
  * @sg_cnt:      SG-list size.
  * @mapped_sg_count: ib_dma_map_sg() return value.
- * @n_rdma_ius:  Number of elements in the rdma_ius array.
- * @rdma_ius:    Array with information about the RDMA mapping.
+ * @n_rdma_wrs:  Number of elements in the rdma_wrs array.
+ * @rdma_wrs:    Array with information about the RDMA mapping.
  * @tag:         Tag of the received SRP information unit.
  * @spinlock:    Protects 'state'.
  * @state:       I/O context state.
- * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether
- *              the already initiated transfers have finished.
  * @cmd:         Target core command data structure.
  * @sense_data:  SCSI sense data.
  */
 struct srpt_send_ioctx {
        struct srpt_ioctx       ioctx;
        struct srpt_rdma_ch     *ch;
-       struct rdma_iu          *rdma_ius;
+       struct ib_rdma_wr       *rdma_wrs;
+       struct ib_cqe           rdma_cqe;
        struct srp_direct_buf   *rbufs;
        struct srp_direct_buf   single_rbuf;
        struct scatterlist      *sg;
        struct list_head        free_list;
        spinlock_t              spinlock;
        enum srpt_command_state state;
-       bool                    rdma_aborted;
        struct se_cmd           cmd;
        struct completion       tx_done;
        int                     sg_cnt;
        int                     mapped_sg_count;
-       u16                     n_rdma_ius;
+       u16                     n_rdma_wrs;
        u8                      n_rdma;
        u8                      n_rbuf;
        bool                    queue_status_only;
@@ -267,9 +236,6 @@ enum rdma_ch_state {
 
 /**
  * struct srpt_rdma_ch - RDMA channel.
- * @wait_queue:    Allows the kernel thread to wait for more work.
- * @thread:        Kernel thread that processes the IB queues associated with
- *                 the channel.
  * @cm_id:         IB CM ID associated with the channel.
  * @qp:            IB queue pair used for communicating over this channel.
  * @cq:            IB completion queue for this channel.
@@ -288,7 +254,6 @@ enum rdma_ch_state {
  * @free_list:     Head of list with free send I/O contexts.
  * @state:         channel state. See also enum rdma_ch_state.
  * @ioctx_ring:    Send ring.
- * @wc:            IB work completion array for srpt_process_completion().
  * @list:          Node for insertion in the srpt_device.rch_list list.
  * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This
  *                 list contains struct srpt_ioctx elements and is protected
@@ -299,8 +264,6 @@ enum rdma_ch_state {
  * @release_done:  Enables waiting for srpt_release_channel() completion.
  */
 struct srpt_rdma_ch {
-       wait_queue_head_t       wait_queue;
-       struct task_struct      *thread;
        struct ib_cm_id         *cm_id;
        struct ib_qp            *qp;
        struct ib_cq            *cq;
@@ -317,7 +280,6 @@ struct srpt_rdma_ch {
        struct list_head        free_list;
        enum rdma_ch_state      state;
        struct srpt_send_ioctx  **ioctx_ring;
-       struct ib_wc            wc[16];
        struct list_head        list;
        struct list_head        cmd_wait_list;
        struct se_session       *sess;
@@ -377,8 +339,6 @@ struct srpt_port {
  * @mr:            L_Key (local key) with write access to all local memory.
  * @srq:           Per-HCA SRQ (shared receive queue).
  * @cm_id:         Connection identifier.
- * @dev_attr:      Attributes of the InfiniBand device as obtained during the
- *                 ib_client.add() callback.
  * @srq_size:      SRQ size.
  * @ioctx_ring:    Per-HCA SRQ.
  * @rch_list:      Per-device channel list -- see also srpt_rdma_ch.list.
@@ -393,7 +353,6 @@ struct srpt_device {
        struct ib_pd            *pd;
        struct ib_srq           *srq;
        struct ib_cm_id         *cm_id;
-       struct ib_device_attr   dev_attr;
        int                     srq_size;
        struct srpt_recv_ioctx  **ioctx_ring;
        struct list_head        rch_list;
index fd4100d56d8c5509986be23e40d2af137cd840b1..6727954ab74be9338e9c3d46e12a55fa3db7a6e7 100644 (file)
  */
 
 #include <linux/kernel.h>
+#include <linux/input.h>
+#include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/module.h>
 #include <linux/usb/input.h>
+#include <linux/usb/quirks.h>
 
 #define DRIVER_AUTHOR "Marko Friedemann <mfr@bmx-chemnitz.de>"
 #define DRIVER_DESC "X-Box pad driver"
@@ -125,7 +128,7 @@ static const struct xpad_device {
        { 0x045e, 0x0289, "Microsoft X-Box pad v2 (US)", 0, XTYPE_XBOX },
        { 0x045e, 0x028e, "Microsoft X-Box 360 pad", 0, XTYPE_XBOX360 },
        { 0x045e, 0x02d1, "Microsoft X-Box One pad", 0, XTYPE_XBOXONE },
-       { 0x045e, 0x02dd, "Microsoft X-Box One pad (Covert Forces)", 0, XTYPE_XBOXONE },
+       { 0x045e, 0x02dd, "Microsoft X-Box One pad (Firmware 2015)", 0, XTYPE_XBOXONE },
        { 0x045e, 0x0291, "Xbox 360 Wireless Receiver (XBOX)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W },
        { 0x045e, 0x0719, "Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W },
        { 0x044f, 0x0f07, "Thrustmaster, Inc. Controller", 0, XTYPE_XBOX },
@@ -317,21 +320,42 @@ static struct usb_device_id xpad_table[] = {
 
 MODULE_DEVICE_TABLE(usb, xpad_table);
 
+struct xpad_output_packet {
+       u8 data[XPAD_PKT_LEN];
+       u8 len;
+       bool pending;
+};
+
+#define XPAD_OUT_CMD_IDX       0
+#define XPAD_OUT_FF_IDX                1
+#define XPAD_OUT_LED_IDX       (1 + IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF))
+#define XPAD_NUM_OUT_PACKETS   (1 + \
+                                IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF) + \
+                                IS_ENABLED(CONFIG_JOYSTICK_XPAD_LEDS))
+
 struct usb_xpad {
        struct input_dev *dev;          /* input device interface */
+       struct input_dev __rcu *x360w_dev;
        struct usb_device *udev;        /* usb device */
        struct usb_interface *intf;     /* usb interface */
 
-       int pad_present;
+       bool pad_present;
+       bool input_created;
 
        struct urb *irq_in;             /* urb for interrupt in report */
        unsigned char *idata;           /* input data */
        dma_addr_t idata_dma;
 
        struct urb *irq_out;            /* urb for interrupt out report */
+       struct usb_anchor irq_out_anchor;
+       bool irq_out_active;            /* we must not use an active URB */
+       u8 odata_serial;                /* serial number for xbox one protocol */
        unsigned char *odata;           /* output data */
        dma_addr_t odata_dma;
-       struct mutex odata_mutex;
+       spinlock_t odata_lock;
+
+       struct xpad_output_packet out_packets[XPAD_NUM_OUT_PACKETS];
+       int last_out_packet;
 
 #if defined(CONFIG_JOYSTICK_XPAD_LEDS)
        struct xpad_led *led;
@@ -343,8 +367,12 @@ struct usb_xpad {
        int xtype;                      /* type of xbox device */
        int pad_nr;                     /* the order x360 pads were attached */
        const char *name;               /* name of the device */
+       struct work_struct work;        /* init/remove device from callback */
 };
 
+static int xpad_init_input(struct usb_xpad *xpad);
+static void xpad_deinit_input(struct usb_xpad *xpad);
+
 /*
  *     xpad_process_packet
  *
@@ -424,11 +452,9 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d
  *             http://www.free60.org/wiki/Gamepad
  */
 
-static void xpad360_process_packet(struct usb_xpad *xpad,
+static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev,
                                   u16 cmd, unsigned char *data)
 {
-       struct input_dev *dev = xpad->dev;
-
        /* digital pad */
        if (xpad->mapping & MAP_DPAD_TO_BUTTONS) {
                /* dpad as buttons (left, right, up, down) */
@@ -495,7 +521,30 @@ static void xpad360_process_packet(struct usb_xpad *xpad,
        input_sync(dev);
 }
 
-static void xpad_identify_controller(struct usb_xpad *xpad);
+static void xpad_presence_work(struct work_struct *work)
+{
+       struct usb_xpad *xpad = container_of(work, struct usb_xpad, work);
+       int error;
+
+       if (xpad->pad_present) {
+               error = xpad_init_input(xpad);
+               if (error) {
+                       /* complain only, not much else we can do here */
+                       dev_err(&xpad->dev->dev,
+                               "unable to init device: %d\n", error);
+               } else {
+                       rcu_assign_pointer(xpad->x360w_dev, xpad->dev);
+               }
+       } else {
+               RCU_INIT_POINTER(xpad->x360w_dev, NULL);
+               synchronize_rcu();
+               /*
+                * Now that we are sure xpad360w_process_packet is not
+                * using input device we can get rid of it.
+                */
+               xpad_deinit_input(xpad);
+       }
+}
 
 /*
  * xpad360w_process_packet
@@ -513,24 +562,28 @@ static void xpad_identify_controller(struct usb_xpad *xpad);
  */
 static void xpad360w_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *data)
 {
+       struct input_dev *dev;
+       bool present;
+
        /* Presence change */
        if (data[0] & 0x08) {
-               if (data[1] & 0x80) {
-                       xpad->pad_present = 1;
-                       /*
-                        * Light up the segment corresponding to
-                        * controller number.
-                        */
-                       xpad_identify_controller(xpad);
-               } else
-                       xpad->pad_present = 0;
+               present = (data[1] & 0x80) != 0;
+
+               if (xpad->pad_present != present) {
+                       xpad->pad_present = present;
+                       schedule_work(&xpad->work);
+               }
        }
 
        /* Valid pad data */
-       if (!(data[1] & 0x1))
+       if (data[1] != 0x1)
                return;
 
-       xpad360_process_packet(xpad, cmd, &data[4]);
+       rcu_read_lock();
+       dev = rcu_dereference(xpad->x360w_dev);
+       if (dev)
+               xpad360_process_packet(xpad, dev, cmd, &data[4]);
+       rcu_read_unlock();
 }
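Two patterns cooperate in the wireless-pad path. Presence changes arrive in URB-completion (atomic) context, where the input device cannot be registered or torn down because those operations may sleep, so the interrupt path only flips pad_present and schedules xpad_presence_work(). The worker then publishes or retracts the device through an RCU-protected pointer, so this packet handler never touches a device mid-teardown. Condensed sketch of the pairing, using the names from the patch:

	/* URB completion (atomic): record the change, punt to the worker */
	if (xpad->pad_present != present) {
		xpad->pad_present = present;
		schedule_work(&xpad->work);	/* runs xpad_presence_work() */
	}

	/* worker (process context): retract, wait out readers, then free */
	RCU_INIT_POINTER(xpad->x360w_dev, NULL);
	synchronize_rcu();	/* xpad360w_process_packet() readers are done */
	xpad_deinit_input(xpad);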
 
 /*
@@ -659,7 +712,7 @@ static void xpad_irq_in(struct urb *urb)
 
        switch (xpad->xtype) {
        case XTYPE_XBOX360:
-               xpad360_process_packet(xpad, 0, xpad->idata);
+               xpad360_process_packet(xpad, xpad->dev, 0, xpad->idata);
                break;
        case XTYPE_XBOX360W:
                xpad360w_process_packet(xpad, 0, xpad->idata);
@@ -678,18 +731,73 @@ exit:
                        __func__, retval);
 }
 
+/* Callers must hold xpad->odata_lock spinlock */
+static bool xpad_prepare_next_out_packet(struct usb_xpad *xpad)
+{
+       struct xpad_output_packet *pkt, *packet = NULL;
+       int i;
+
+       for (i = 0; i < XPAD_NUM_OUT_PACKETS; i++) {
+               if (++xpad->last_out_packet >= XPAD_NUM_OUT_PACKETS)
+                       xpad->last_out_packet = 0;
+
+               pkt = &xpad->out_packets[xpad->last_out_packet];
+               if (pkt->pending) {
+                       dev_dbg(&xpad->intf->dev,
+                               "%s - found pending output packet %d\n",
+                               __func__, xpad->last_out_packet);
+                       packet = pkt;
+                       break;
+               }
+       }
+
+       if (packet) {
+               memcpy(xpad->odata, packet->data, packet->len);
+               xpad->irq_out->transfer_buffer_length = packet->len;
+               return true;
+       }
+
+       return false;
+}
+
+/* Callers must hold xpad->odata_lock spinlock */
+static int xpad_try_sending_next_out_packet(struct usb_xpad *xpad)
+{
+       int error;
+
+       if (!xpad->irq_out_active && xpad_prepare_next_out_packet(xpad)) {
+               usb_anchor_urb(xpad->irq_out, &xpad->irq_out_anchor);
+               error = usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
+               if (error) {
+                       dev_err(&xpad->intf->dev,
+                               "%s - usb_submit_urb failed with result %d\n",
+                               __func__, error);
+                       usb_unanchor_urb(xpad->irq_out);
+                       return -EIO;
+               }
+
+               xpad->irq_out_active = true;
+       }
+
+       return 0;
+}
+
 static void xpad_irq_out(struct urb *urb)
 {
        struct usb_xpad *xpad = urb->context;
        struct device *dev = &xpad->intf->dev;
-       int retval, status;
+       int status = urb->status;
+       int error;
+       unsigned long flags;
 
-       status = urb->status;
+       spin_lock_irqsave(&xpad->odata_lock, flags);
 
        switch (status) {
        case 0:
                /* success */
-               return;
+               xpad->out_packets[xpad->last_out_packet].pending = false;
+               xpad->irq_out_active = xpad_prepare_next_out_packet(xpad);
+               break;
 
        case -ECONNRESET:
        case -ENOENT:
@@ -697,19 +805,28 @@ static void xpad_irq_out(struct urb *urb)
                /* this urb is terminated, clean up */
                dev_dbg(dev, "%s - urb shutting down with status: %d\n",
                        __func__, status);
-               return;
+               xpad->irq_out_active = false;
+               break;
 
        default:
                dev_dbg(dev, "%s - nonzero urb status received: %d\n",
                        __func__, status);
-               goto exit;
+               break;
        }
 
-exit:
-       retval = usb_submit_urb(urb, GFP_ATOMIC);
-       if (retval)
-               dev_err(dev, "%s - usb_submit_urb failed with result %d\n",
-                       __func__, retval);
+       if (xpad->irq_out_active) {
+               usb_anchor_urb(urb, &xpad->irq_out_anchor);
+               error = usb_submit_urb(urb, GFP_ATOMIC);
+               if (error) {
+                       dev_err(dev,
+                               "%s - usb_submit_urb failed with result %d\n",
+                               __func__, error);
+                       usb_unanchor_urb(urb);
+                       xpad->irq_out_active = false;
+               }
+       }
+
+       spin_unlock_irqrestore(&xpad->odata_lock, flags);
 }
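The rework replaces the single shared odata buffer, which rumble, LED and presence requests used to race over (force-feedback requests can arrive via input's event path in atomic context, so the old sleeping mutex could never cover them), with a small ring of per-purpose packet slots: a slot is filled and marked pending under the odata_lock spinlock, then either submitted immediately if no URB is in flight or picked up by xpad_irq_out() when the current one completes. A hypothetical caller (example_queue_led mirrors xpad_send_led_command further down):

	static int example_queue_led(struct usb_xpad *xpad, u8 command)
	{
		struct xpad_output_packet *pkt =
				&xpad->out_packets[XPAD_OUT_LED_IDX];
		unsigned long flags;
		int error;

		spin_lock_irqsave(&xpad->odata_lock, flags);

		pkt->data[0] = 0x01;
		pkt->data[1] = 0x03;
		pkt->data[2] = command;
		pkt->len = 3;
		pkt->pending = true;	/* seen by xpad_prepare_next_out_packet() */

		error = xpad_try_sending_next_out_packet(xpad);

		spin_unlock_irqrestore(&xpad->odata_lock, flags);
		return error;
	}

Because each packet type owns a fixed slot, a newer rumble or LED request overwrites the stale pending one instead of queueing behind it.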
 
 static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad)
@@ -721,6 +838,8 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad)
        if (xpad->xtype == XTYPE_UNKNOWN)
                return 0;
 
+       init_usb_anchor(&xpad->irq_out_anchor);
+
        xpad->odata = usb_alloc_coherent(xpad->udev, XPAD_PKT_LEN,
                                         GFP_KERNEL, &xpad->odata_dma);
        if (!xpad->odata) {
@@ -728,7 +847,7 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad)
                goto fail1;
        }
 
-       mutex_init(&xpad->odata_mutex);
+       spin_lock_init(&xpad->odata_lock);
 
        xpad->irq_out = usb_alloc_urb(0, GFP_KERNEL);
        if (!xpad->irq_out) {
@@ -755,8 +874,14 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad)
 
 static void xpad_stop_output(struct usb_xpad *xpad)
 {
-       if (xpad->xtype != XTYPE_UNKNOWN)
-               usb_kill_urb(xpad->irq_out);
+       if (xpad->xtype != XTYPE_UNKNOWN) {
+               if (!usb_wait_anchor_empty_timeout(&xpad->irq_out_anchor,
+                                                  5000)) {
+                       dev_warn(&xpad->intf->dev,
+                                "timed out waiting for output URB to complete, killing\n");
+                       usb_kill_anchored_urbs(&xpad->irq_out_anchor);
+               }
+       }
 }
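Anchoring gives disconnect a handle on whatever output URB is still in flight: the submit path anchors before usb_submit_urb() and unanchors if submission fails, and teardown waits for the anchor to drain before resorting to killing. Lifecycle sketch with the names used above:

	init_usb_anchor(&xpad->irq_out_anchor);	/* once, in xpad_init_output() */

	/* submit path (under odata_lock): */
	usb_anchor_urb(xpad->irq_out, &xpad->irq_out_anchor);
	error = usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
	if (error)
		usb_unanchor_urb(xpad->irq_out);	/* failed submit: drop it */

	/* teardown: give a queued packet five seconds, then force the issue */
	if (!usb_wait_anchor_empty_timeout(&xpad->irq_out_anchor, 5000))
		usb_kill_anchored_urbs(&xpad->irq_out_anchor);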
 
 static void xpad_deinit_output(struct usb_xpad *xpad)
@@ -770,27 +895,60 @@ static void xpad_deinit_output(struct usb_xpad *xpad)
 
 static int xpad_inquiry_pad_presence(struct usb_xpad *xpad)
 {
+       struct xpad_output_packet *packet =
+                       &xpad->out_packets[XPAD_OUT_CMD_IDX];
+       unsigned long flags;
        int retval;
 
-       mutex_lock(&xpad->odata_mutex);
+       spin_lock_irqsave(&xpad->odata_lock, flags);
+
+       packet->data[0] = 0x08;
+       packet->data[1] = 0x00;
+       packet->data[2] = 0x0F;
+       packet->data[3] = 0xC0;
+       packet->data[4] = 0x00;
+       packet->data[5] = 0x00;
+       packet->data[6] = 0x00;
+       packet->data[7] = 0x00;
+       packet->data[8] = 0x00;
+       packet->data[9] = 0x00;
+       packet->data[10] = 0x00;
+       packet->data[11] = 0x00;
+       packet->len = 12;
+       packet->pending = true;
+
+       /* Reset the sequence so we send out presence first */
+       xpad->last_out_packet = -1;
+       retval = xpad_try_sending_next_out_packet(xpad);
+
+       spin_unlock_irqrestore(&xpad->odata_lock, flags);
 
-       xpad->odata[0] = 0x08;
-       xpad->odata[1] = 0x00;
-       xpad->odata[2] = 0x0F;
-       xpad->odata[3] = 0xC0;
-       xpad->odata[4] = 0x00;
-       xpad->odata[5] = 0x00;
-       xpad->odata[6] = 0x00;
-       xpad->odata[7] = 0x00;
-       xpad->odata[8] = 0x00;
-       xpad->odata[9] = 0x00;
-       xpad->odata[10] = 0x00;
-       xpad->odata[11] = 0x00;
-       xpad->irq_out->transfer_buffer_length = 12;
+       return retval;
+}
+
+static int xpad_start_xbox_one(struct usb_xpad *xpad)
+{
+       struct xpad_output_packet *packet =
+                       &xpad->out_packets[XPAD_OUT_CMD_IDX];
+       unsigned long flags;
+       int retval;
 
-       retval = usb_submit_urb(xpad->irq_out, GFP_KERNEL);
+       spin_lock_irqsave(&xpad->odata_lock, flags);
 
-       mutex_unlock(&xpad->odata_mutex);
+       /* Xbox one controller needs to be initialized. */
+       packet->data[0] = 0x05;
+       packet->data[1] = 0x20;
+       packet->data[2] = xpad->odata_serial++; /* packet serial */
+       packet->data[3] = 0x01; /* rumble bit enable?  */
+       packet->data[4] = 0x00;
+       packet->len = 5;
+       packet->pending = true;
+
+       /* Reset the sequence so we send out start packet first */
+       xpad->last_out_packet = -1;
+       retval = xpad_try_sending_next_out_packet(xpad);
+
+       spin_unlock_irqrestore(&xpad->odata_lock, flags);
 
        return retval;
 }
@@ -799,8 +957,11 @@ static int xpad_inquiry_pad_presence(struct usb_xpad *xpad)
 static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect *effect)
 {
        struct usb_xpad *xpad = input_get_drvdata(dev);
+       struct xpad_output_packet *packet = &xpad->out_packets[XPAD_OUT_FF_IDX];
        __u16 strong;
        __u16 weak;
+       int retval;
+       unsigned long flags;
 
        if (effect->type != FF_RUMBLE)
                return 0;
@@ -808,69 +969,81 @@ static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect
        strong = effect->u.rumble.strong_magnitude;
        weak = effect->u.rumble.weak_magnitude;
 
+       spin_lock_irqsave(&xpad->odata_lock, flags);
+
        switch (xpad->xtype) {
        case XTYPE_XBOX:
-               xpad->odata[0] = 0x00;
-               xpad->odata[1] = 0x06;
-               xpad->odata[2] = 0x00;
-               xpad->odata[3] = strong / 256;  /* left actuator */
-               xpad->odata[4] = 0x00;
-               xpad->odata[5] = weak / 256;    /* right actuator */
-               xpad->irq_out->transfer_buffer_length = 6;
+               packet->data[0] = 0x00;
+               packet->data[1] = 0x06;
+               packet->data[2] = 0x00;
+               packet->data[3] = strong / 256; /* left actuator */
+               packet->data[4] = 0x00;
+               packet->data[5] = weak / 256;   /* right actuator */
+               packet->len = 6;
+               packet->pending = true;
                break;
 
        case XTYPE_XBOX360:
-               xpad->odata[0] = 0x00;
-               xpad->odata[1] = 0x08;
-               xpad->odata[2] = 0x00;
-               xpad->odata[3] = strong / 256;  /* left actuator? */
-               xpad->odata[4] = weak / 256;    /* right actuator? */
-               xpad->odata[5] = 0x00;
-               xpad->odata[6] = 0x00;
-               xpad->odata[7] = 0x00;
-               xpad->irq_out->transfer_buffer_length = 8;
+               packet->data[0] = 0x00;
+               packet->data[1] = 0x08;
+               packet->data[2] = 0x00;
+               packet->data[3] = strong / 256;  /* left actuator? */
+               packet->data[4] = weak / 256;   /* right actuator? */
+               packet->data[5] = 0x00;
+               packet->data[6] = 0x00;
+               packet->data[7] = 0x00;
+               packet->len = 8;
+               packet->pending = true;
                break;
 
        case XTYPE_XBOX360W:
-               xpad->odata[0] = 0x00;
-               xpad->odata[1] = 0x01;
-               xpad->odata[2] = 0x0F;
-               xpad->odata[3] = 0xC0;
-               xpad->odata[4] = 0x00;
-               xpad->odata[5] = strong / 256;
-               xpad->odata[6] = weak / 256;
-               xpad->odata[7] = 0x00;
-               xpad->odata[8] = 0x00;
-               xpad->odata[9] = 0x00;
-               xpad->odata[10] = 0x00;
-               xpad->odata[11] = 0x00;
-               xpad->irq_out->transfer_buffer_length = 12;
+               packet->data[0] = 0x00;
+               packet->data[1] = 0x01;
+               packet->data[2] = 0x0F;
+               packet->data[3] = 0xC0;
+               packet->data[4] = 0x00;
+               packet->data[5] = strong / 256;
+               packet->data[6] = weak / 256;
+               packet->data[7] = 0x00;
+               packet->data[8] = 0x00;
+               packet->data[9] = 0x00;
+               packet->data[10] = 0x00;
+               packet->data[11] = 0x00;
+               packet->len = 12;
+               packet->pending = true;
                break;
 
        case XTYPE_XBOXONE:
-               xpad->odata[0] = 0x09; /* activate rumble */
-               xpad->odata[1] = 0x08;
-               xpad->odata[2] = 0x00;
-               xpad->odata[3] = 0x08; /* continuous effect */
-               xpad->odata[4] = 0x00; /* simple rumble mode */
-               xpad->odata[5] = 0x03; /* L and R actuator only */
-               xpad->odata[6] = 0x00; /* TODO: LT actuator */
-               xpad->odata[7] = 0x00; /* TODO: RT actuator */
-               xpad->odata[8] = strong / 256;  /* left actuator */
-               xpad->odata[9] = weak / 256;    /* right actuator */
-               xpad->odata[10] = 0x80; /* length of pulse */
-               xpad->odata[11] = 0x00; /* stop period of pulse */
-               xpad->irq_out->transfer_buffer_length = 12;
+               packet->data[0] = 0x09; /* activate rumble */
+               packet->data[1] = 0x08;
+               packet->data[2] = xpad->odata_serial++;
+               packet->data[3] = 0x08; /* continuous effect */
+               packet->data[4] = 0x00; /* simple rumble mode */
+               packet->data[5] = 0x03; /* L and R actuator only */
+               packet->data[6] = 0x00; /* TODO: LT actuator */
+               packet->data[7] = 0x00; /* TODO: RT actuator */
+               packet->data[8] = strong / 512; /* left actuator */
+               packet->data[9] = weak / 512;   /* right actuator */
+               packet->data[10] = 0x80;        /* length of pulse */
+               packet->data[11] = 0x00;        /* stop period of pulse */
+               packet->data[12] = 0x00;
+               packet->len = 13;
+               packet->pending = true;
                break;
 
        default:
                dev_dbg(&xpad->dev->dev,
                        "%s - rumble command sent to unsupported xpad type: %d\n",
                        __func__, xpad->xtype);
-               return -EINVAL;
+               retval = -EINVAL;
+               goto out;
        }
 
-       return usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
+       retval = xpad_try_sending_next_out_packet(xpad);
+
+out:
+       spin_unlock_irqrestore(&xpad->odata_lock, flags);
+       return retval;
 }
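
The hunks above stop submitting the out URB directly from the FF and LED
paths: each path now fills its own packet slot, marks it pending under
odata_lock and asks a shared scheduler to flush it. The scheduler's body is
not part of this hunk; below is a minimal sketch of the idea, assuming an
out_packets[] array bounded by XPAD_NUM_OUT_PACKETS and an irq_out_active
busy flag (those two names are assumptions for the sketch, not necessarily
the driver's):

#include <linux/string.h>
#include <linux/usb.h>

/* Sketch only: hand the first pending packet slot to the out URB. */
static int xpad_try_sending_next_out_packet(struct usb_xpad *xpad)
{
	int i, error = 0;

	if (xpad->irq_out_active)	/* a transfer is already in flight */
		return 0;

	for (i = 0; i < XPAD_NUM_OUT_PACKETS; i++) {
		struct xpad_output_packet *pkt = &xpad->out_packets[i];

		if (!pkt->pending)
			continue;

		memcpy(xpad->odata, pkt->data, pkt->len);
		xpad->irq_out->transfer_buffer_length = pkt->len;
		pkt->pending = false;

		/* Callers hold odata_lock with IRQs off, hence GFP_ATOMIC. */
		error = usb_submit_urb(xpad->irq_out, GFP_ATOMIC);
		if (!error)
			xpad->irq_out_active = true;
		break;
	}

	return error;
}

Because each purpose owns one slot, a slot overwritten before it is sent
simply coalesces into the latest value, which is exactly what you want for
rumble strength and LED state.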
 
 static int xpad_init_ff(struct usb_xpad *xpad)
@@ -921,36 +1094,44 @@ struct xpad_led {
  */
 static void xpad_send_led_command(struct usb_xpad *xpad, int command)
 {
+       struct xpad_output_packet *packet =
+                       &xpad->out_packets[XPAD_OUT_LED_IDX];
+       unsigned long flags;
+
        command %= 16;
 
-       mutex_lock(&xpad->odata_mutex);
+       spin_lock_irqsave(&xpad->odata_lock, flags);
 
        switch (xpad->xtype) {
        case XTYPE_XBOX360:
-               xpad->odata[0] = 0x01;
-               xpad->odata[1] = 0x03;
-               xpad->odata[2] = command;
-               xpad->irq_out->transfer_buffer_length = 3;
+               packet->data[0] = 0x01;
+               packet->data[1] = 0x03;
+               packet->data[2] = command;
+               packet->len = 3;
+               packet->pending = true;
                break;
+
        case XTYPE_XBOX360W:
-               xpad->odata[0] = 0x00;
-               xpad->odata[1] = 0x00;
-               xpad->odata[2] = 0x08;
-               xpad->odata[3] = 0x40 + command;
-               xpad->odata[4] = 0x00;
-               xpad->odata[5] = 0x00;
-               xpad->odata[6] = 0x00;
-               xpad->odata[7] = 0x00;
-               xpad->odata[8] = 0x00;
-               xpad->odata[9] = 0x00;
-               xpad->odata[10] = 0x00;
-               xpad->odata[11] = 0x00;
-               xpad->irq_out->transfer_buffer_length = 12;
+               packet->data[0] = 0x00;
+               packet->data[1] = 0x00;
+               packet->data[2] = 0x08;
+               packet->data[3] = 0x40 + command;
+               packet->data[4] = 0x00;
+               packet->data[5] = 0x00;
+               packet->data[6] = 0x00;
+               packet->data[7] = 0x00;
+               packet->data[8] = 0x00;
+               packet->data[9] = 0x00;
+               packet->data[10] = 0x00;
+               packet->data[11] = 0x00;
+               packet->len = 12;
+               packet->pending = true;
                break;
        }
 
-       usb_submit_urb(xpad->irq_out, GFP_KERNEL);
-       mutex_unlock(&xpad->odata_mutex);
+       xpad_try_sending_next_out_packet(xpad);
+
+       spin_unlock_irqrestore(&xpad->odata_lock, flags);
 }
 
 /*
@@ -959,7 +1140,7 @@ static void xpad_send_led_command(struct usb_xpad *xpad, int command)
  */
 static void xpad_identify_controller(struct usb_xpad *xpad)
 {
-       xpad_send_led_command(xpad, (xpad->pad_nr % 4) + 2);
+       led_set_brightness(&xpad->led->led_cdev, (xpad->pad_nr % 4) + 2);
 }
 
 static void xpad_led_set(struct led_classdev *led_cdev,
@@ -1001,14 +1182,7 @@ static int xpad_led_probe(struct usb_xpad *xpad)
        if (error)
                goto err_free_id;
 
-       if (xpad->xtype == XTYPE_XBOX360) {
-               /*
-                * Light up the segment corresponding to controller
-                * number on wired devices. On wireless we'll do that
-                * when they respond to "presence" packet.
-                */
-               xpad_identify_controller(xpad);
-       }
+       xpad_identify_controller(xpad);
 
        return 0;
 
@@ -1036,37 +1210,73 @@ static void xpad_led_disconnect(struct usb_xpad *xpad) { }
 static void xpad_identify_controller(struct usb_xpad *xpad) { }
 #endif
 
-static int xpad_open(struct input_dev *dev)
+static int xpad_start_input(struct usb_xpad *xpad)
 {
-       struct usb_xpad *xpad = input_get_drvdata(dev);
-
-       /* URB was submitted in probe */
-       if (xpad->xtype == XTYPE_XBOX360W)
-               return 0;
+       int error;
 
-       xpad->irq_in->dev = xpad->udev;
        if (usb_submit_urb(xpad->irq_in, GFP_KERNEL))
                return -EIO;
 
        if (xpad->xtype == XTYPE_XBOXONE) {
-               /* Xbox one controller needs to be initialized. */
-               xpad->odata[0] = 0x05;
-               xpad->odata[1] = 0x20;
-               xpad->irq_out->transfer_buffer_length = 2;
-               return usb_submit_urb(xpad->irq_out, GFP_KERNEL);
+               error = xpad_start_xbox_one(xpad);
+               if (error) {
+                       usb_kill_urb(xpad->irq_in);
+                       return error;
+               }
        }
 
        return 0;
 }
 
-static void xpad_close(struct input_dev *dev)
+static void xpad_stop_input(struct usb_xpad *xpad)
 {
-       struct usb_xpad *xpad = input_get_drvdata(dev);
+       usb_kill_urb(xpad->irq_in);
+}
+
+static int xpad360w_start_input(struct usb_xpad *xpad)
+{
+       int error;
 
-       if (xpad->xtype != XTYPE_XBOX360W)
+       error = usb_submit_urb(xpad->irq_in, GFP_KERNEL);
+       if (error)
+               return -EIO;
+
+       /*
+        * Send presence packet.
+        * This will force the controller to resend connection packets.
+        * This is useful when the module is loaded after the adapter
+        * has been plugged in, as the adapter won't automatically
+        * send us info about the attached controllers.
+        */
+       error = xpad_inquiry_pad_presence(xpad);
+       if (error) {
                usb_kill_urb(xpad->irq_in);
+               return error;
+       }
 
-       xpad_stop_output(xpad);
+       return 0;
+}
+
+static void xpad360w_stop_input(struct usb_xpad *xpad)
+{
+       usb_kill_urb(xpad->irq_in);
+
+       /* Make sure we are done with presence work if it was scheduled */
+       flush_work(&xpad->work);
+}
+
+static int xpad_open(struct input_dev *dev)
+{
+       struct usb_xpad *xpad = input_get_drvdata(dev);
+
+       return xpad_start_input(xpad);
+}
+
+static void xpad_close(struct input_dev *dev)
+{
+       struct usb_xpad *xpad = input_get_drvdata(dev);
+
+       xpad_stop_input(xpad);
 }
 
 static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs)
@@ -1097,8 +1307,11 @@ static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs)
 
 static void xpad_deinit_input(struct usb_xpad *xpad)
 {
-       xpad_led_disconnect(xpad);
-       input_unregister_device(xpad->dev);
+       if (xpad->input_created) {
+               xpad->input_created = false;
+               xpad_led_disconnect(xpad);
+               input_unregister_device(xpad->dev);
+       }
 }
 
 static int xpad_init_input(struct usb_xpad *xpad)
@@ -1118,8 +1331,10 @@ static int xpad_init_input(struct usb_xpad *xpad)
 
        input_set_drvdata(input_dev, xpad);
 
-       input_dev->open = xpad_open;
-       input_dev->close = xpad_close;
+       if (xpad->xtype != XTYPE_XBOX360W) {
+               input_dev->open = xpad_open;
+               input_dev->close = xpad_close;
+       }
 
        __set_bit(EV_KEY, input_dev->evbit);
 
@@ -1181,6 +1396,7 @@ static int xpad_init_input(struct usb_xpad *xpad)
        if (error)
                goto err_disconnect_led;
 
+       xpad->input_created = true;
        return 0;
 
 err_disconnect_led:
@@ -1241,6 +1457,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
        xpad->mapping = xpad_device[i].mapping;
        xpad->xtype = xpad_device[i].xtype;
        xpad->name = xpad_device[i].name;
+       INIT_WORK(&xpad->work, xpad_presence_work);
 
        if (xpad->xtype == XTYPE_UNKNOWN) {
                if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) {
@@ -1277,10 +1494,6 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 
        usb_set_intfdata(intf, xpad);
 
-       error = xpad_init_input(xpad);
-       if (error)
-               goto err_deinit_output;
-
        if (xpad->xtype == XTYPE_XBOX360W) {
                /*
                 * Submit the int URB immediately rather than waiting for open
@@ -1289,28 +1502,24 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
                 * exactly the message that a controller has arrived that
                 * we're waiting for.
                 */
-               xpad->irq_in->dev = xpad->udev;
-               error = usb_submit_urb(xpad->irq_in, GFP_KERNEL);
+               error = xpad360w_start_input(xpad);
                if (error)
-                       goto err_deinit_input;
-
+                       goto err_deinit_output;
                /*
-                * Send presence packet.
-                * This will force the controller to resend connection packets.
-                * This is useful in the case we activate the module after the
-                * adapter has been plugged in, as it won't automatically
-                * send us info about the controllers.
+                * Wireless controllers require RESET_RESUME to work properly
+                * after suspend. Ideally this quirk should be in usb core
+                * quirk list, but we have too many vendors producing these
+                * controllers and we'd need to maintain 2 identical lists
+                * here in this driver and in usb core.
                 */
-               error = xpad_inquiry_pad_presence(xpad);
+               udev->quirks |= USB_QUIRK_RESET_RESUME;
+       } else {
+               error = xpad_init_input(xpad);
                if (error)
-                       goto err_kill_in_urb;
+                       goto err_deinit_output;
        }
        return 0;
 
-err_kill_in_urb:
-       usb_kill_urb(xpad->irq_in);
-err_deinit_input:
-       xpad_deinit_input(xpad);
 err_deinit_output:
        xpad_deinit_output(xpad);
 err_free_in_urb:
@@ -1320,19 +1529,24 @@ err_free_idata:
 err_free_mem:
        kfree(xpad);
        return error;
-
 }
 
 static void xpad_disconnect(struct usb_interface *intf)
 {
-       struct usb_xpad *xpad = usb_get_intfdata (intf);
+       struct usb_xpad *xpad = usb_get_intfdata(intf);
+
+       if (xpad->xtype == XTYPE_XBOX360W)
+               xpad360w_stop_input(xpad);
 
        xpad_deinit_input(xpad);
-       xpad_deinit_output(xpad);
 
-       if (xpad->xtype == XTYPE_XBOX360W) {
-               usb_kill_urb(xpad->irq_in);
-       }
+       /*
+        * Now that both the input device and the LED device are gone we
+        * can stop the output URB.
+        */
+       xpad_stop_output(xpad);
+
+       xpad_deinit_output(xpad);
 
        usb_free_urb(xpad->irq_in);
        usb_free_coherent(xpad->udev, XPAD_PKT_LEN,
@@ -1343,10 +1557,55 @@ static void xpad_disconnect(struct usb_interface *intf)
        usb_set_intfdata(intf, NULL);
 }
 
+static int xpad_suspend(struct usb_interface *intf, pm_message_t message)
+{
+       struct usb_xpad *xpad = usb_get_intfdata(intf);
+       struct input_dev *input = xpad->dev;
+
+       if (xpad->xtype == XTYPE_XBOX360W) {
+               /*
+                * Wireless controllers always listen for input so
+                * that they are notified when a controller shows up
+                * or goes away.
+                */
+               xpad360w_stop_input(xpad);
+       } else {
+               mutex_lock(&input->mutex);
+               if (input->users)
+                       xpad_stop_input(xpad);
+               mutex_unlock(&input->mutex);
+       }
+
+       xpad_stop_output(xpad);
+
+       return 0;
+}
+
+static int xpad_resume(struct usb_interface *intf)
+{
+       struct usb_xpad *xpad = usb_get_intfdata(intf);
+       struct input_dev *input = xpad->dev;
+       int retval = 0;
+
+       if (xpad->xtype == XTYPE_XBOX360W) {
+               retval = xpad360w_start_input(xpad);
+       } else {
+               mutex_lock(&input->mutex);
+               if (input->users)
+                       retval = xpad_start_input(xpad);
+               mutex_unlock(&input->mutex);
+       }
+
+       return retval;
+}
+
 static struct usb_driver xpad_driver = {
        .name           = "xpad",
        .probe          = xpad_probe,
        .disconnect     = xpad_disconnect,
+       .suspend        = xpad_suspend,
+       .resume         = xpad_resume,
+       .reset_resume   = xpad_resume,
        .id_table       = xpad_table,
 };
 
index b9f01bd1b7ef1b76a7b4c0d46f71fc3e4ace1818..29093657f2ef8233c667aace8406790a7d859f1d 100644 (file)
@@ -630,7 +630,7 @@ gpio_keys_get_devtree_pdata(struct device *dev)
        if (!node)
                return ERR_PTR(-ENODEV);
 
-       nbuttons = of_get_child_count(node);
+       nbuttons = of_get_available_child_count(node);
        if (nbuttons == 0)
                return ERR_PTR(-ENODEV);
 
@@ -645,8 +645,10 @@ gpio_keys_get_devtree_pdata(struct device *dev)
 
        pdata->rep = !!of_get_property(node, "autorepeat", NULL);
 
+       of_property_read_string(node, "label", &pdata->name);
+
        i = 0;
-       for_each_child_of_node(node, pp) {
+       for_each_available_child_of_node(node, pp) {
                enum of_gpio_flags flags;
 
                button = &pdata->buttons[i++];
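
The switch from of_get_child_count() to of_get_available_child_count() (and
to the matching iterator) means children carrying status = "disabled" no
longer consume pdata->buttons slots or get registered as keys. A small
hedged sketch of the paired count-and-walk, using only helpers that exist
in <linux/of.h> (the function name is invented for illustration):

#include <linux/of.h>
#include <linux/printk.h>

/* Sketch: count and visit only enabled (status "okay" or absent) children. */
static int count_enabled_buttons(struct device_node *node)
{
	struct device_node *pp;
	int nbuttons = of_get_available_child_count(node);

	for_each_available_child_of_node(node, pp)
		pr_debug("gpio-keys: enabled child node %s\n", pp->name);

	return nbuttons;
}
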
index 2d5794ec338b72a1cea0367bccdaf9654db025d4..2160512e861af57d289fb5873ef80b81c2fbf7c9 100644 (file)
@@ -113,8 +113,8 @@ struct t7_config {
 #define MXT_T9_DETECT          (1 << 7)
 
 struct t9_range {
-       u16 x;
-       u16 y;
+       __le16 x;
+       __le16 y;
 } __packed;
 
 /* MXT_TOUCH_MULTI_T9 orient */
@@ -216,6 +216,7 @@ struct mxt_data {
        unsigned int irq;
        unsigned int max_x;
        unsigned int max_y;
+       bool xy_switch;
        bool in_bootloader;
        u16 mem_size;
        u8 t100_aux_ampl;
@@ -1665,8 +1666,8 @@ static int mxt_read_t9_resolution(struct mxt_data *data)
        if (error)
                return error;
 
-       le16_to_cpus(&range.x);
-       le16_to_cpus(&range.y);
+       data->max_x = get_unaligned_le16(&range.x);
+       data->max_y = get_unaligned_le16(&range.y);
 
        error =  __mxt_read_reg(client,
                                object->start_address + MXT_T9_ORIENT,
@@ -1674,23 +1675,7 @@ static int mxt_read_t9_resolution(struct mxt_data *data)
        if (error)
                return error;
 
-       /* Handle default values */
-       if (range.x == 0)
-               range.x = 1023;
-
-       if (range.y == 0)
-               range.y = 1023;
-
-       if (orient & MXT_T9_ORIENT_SWITCH) {
-               data->max_x = range.y;
-               data->max_y = range.x;
-       } else {
-               data->max_x = range.x;
-               data->max_y = range.y;
-       }
-
-       dev_dbg(&client->dev,
-               "Touchscreen size X%uY%u\n", data->max_x, data->max_y);
+       data->xy_switch = orient & MXT_T9_ORIENT_SWITCH;
 
        return 0;
 }
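
The type change from u16 to __le16 together with get_unaligned_le16() is
not cosmetic: the struct is __packed and filled from a little-endian
register dump, so its members may sit at unaligned addresses, and the old
in-place le16_to_cpus() both assumed alignment and hid the wire endianness
from sparse. A self-contained illustration (the struct name is invented
for the example):

#include <linux/types.h>
#include <asm/unaligned.h>

struct le_range_example {
	__le16 x;			/* little-endian on the wire */
	__le16 y;
} __packed;

/* Safe on any architecture, whatever the alignment or host endianness. */
static void read_range_example(const struct le_range_example *r,
			       unsigned int *max_x, unsigned int *max_y)
{
	*max_x = get_unaligned_le16(&r->x);
	*max_y = get_unaligned_le16(&r->y);
}
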
@@ -1708,13 +1693,14 @@ static int mxt_read_t100_config(struct mxt_data *data)
        if (!object)
                return -EINVAL;
 
+       /* read touchscreen dimensions */
        error = __mxt_read_reg(client,
                               object->start_address + MXT_T100_XRANGE,
                               sizeof(range_x), &range_x);
        if (error)
                return error;
 
-       le16_to_cpus(&range_x);
+       data->max_x = get_unaligned_le16(&range_x);
 
        error = __mxt_read_reg(client,
                               object->start_address + MXT_T100_YRANGE,
@@ -1722,36 +1708,24 @@ static int mxt_read_t100_config(struct mxt_data *data)
        if (error)
                return error;
 
-       le16_to_cpus(&range_y);
+       data->max_y = get_unaligned_le16(&range_y);
 
+       /* read orientation config */
        error =  __mxt_read_reg(client,
                                object->start_address + MXT_T100_CFG1,
                                1, &cfg);
        if (error)
                return error;
 
+       data->xy_switch = cfg & MXT_T100_CFG_SWITCHXY;
+
+       /* allocate aux bytes */
        error =  __mxt_read_reg(client,
                                object->start_address + MXT_T100_TCHAUX,
                                1, &tchaux);
        if (error)
                return error;
 
-       /* Handle default values */
-       if (range_x == 0)
-               range_x = 1023;
-
-       if (range_y == 0)
-               range_y = 1023;
-
-       if (cfg & MXT_T100_CFG_SWITCHXY) {
-               data->max_x = range_y;
-               data->max_y = range_x;
-       } else {
-               data->max_x = range_x;
-               data->max_y = range_y;
-       }
-
-       /* allocate aux bytes */
        aux = 6;
 
        if (tchaux & MXT_T100_TCHAUX_VECT)
@@ -1767,9 +1741,6 @@ static int mxt_read_t100_config(struct mxt_data *data)
                "T100 aux mappings vect:%u ampl:%u area:%u\n",
                data->t100_aux_vect, data->t100_aux_ampl, data->t100_aux_area);
 
-       dev_info(&client->dev,
-                "T100 Touchscreen size X%uY%u\n", data->max_x, data->max_y);
-
        return 0;
 }
 
@@ -1828,6 +1799,19 @@ static int mxt_initialize_input_device(struct mxt_data *data)
                return -EINVAL;
        }
 
+       /* Handle default values and orientation switch */
+       if (data->max_x == 0)
+               data->max_x = 1023;
+
+       if (data->max_y == 0)
+               data->max_y = 1023;
+
+       if (data->xy_switch)
+               swap(data->max_x, data->max_y);
+
+       dev_info(dev, "Touchscreen size X%uY%u\n", data->max_x, data->max_y);
+
+       /* Register input device */
        input_dev = input_allocate_device();
        if (!input_dev) {
                dev_err(dev, "Failed to allocate memory\n");
index 7e0f42acb7376cda1378c22f9712d7f006c39af2..a7a0a22cf1a59661b22f9997589eab9e73a13ec7 100644 (file)
@@ -2,6 +2,6 @@
 # Makefile for Open-Channel SSDs.
 #
 
-obj-$(CONFIG_NVM)              := core.o
+obj-$(CONFIG_NVM)              := core.o sysblk.o
 obj-$(CONFIG_NVM_GENNVM)       += gennvm.o
 obj-$(CONFIG_NVM_RRPC)         += rrpc.o
index 8f41b245cd55b55862911baaf2220d885fc30142..33224cb91c5bb98ac7e464b4202dacbac71671c9 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/miscdevice.h>
 #include <linux/lightnvm.h>
+#include <linux/sched/sysctl.h>
 #include <uapi/linux/lightnvm.h>
 
 static LIST_HEAD(nvm_targets);
@@ -105,6 +106,9 @@ struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev)
        lockdep_assert_held(&nvm_lock);
 
        list_for_each_entry(mt, &nvm_mgrs, list) {
+               if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN))
+                       continue;
+
                ret = mt->register_mgr(dev);
                if (ret < 0) {
                        pr_err("nvm: media mgr failed to init (%d) on dev %s\n",
@@ -166,6 +170,20 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
        return NULL;
 }
 
+struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun,
+                                                       unsigned long flags)
+{
+       return dev->mt->get_blk_unlocked(dev, lun, flags);
+}
+EXPORT_SYMBOL(nvm_get_blk_unlocked);
+
+/* Assumes that all valid pages have already been moved on release to bm */
+void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk)
+{
+       return dev->mt->put_blk_unlocked(dev, blk);
+}
+EXPORT_SYMBOL(nvm_put_blk_unlocked);
+
 struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun,
                                                        unsigned long flags)
 {
@@ -192,6 +210,206 @@ int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk)
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
+void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+       int i;
+
+       if (rqd->nr_pages > 1) {
+               for (i = 0; i < rqd->nr_pages; i++)
+                       rqd->ppa_list[i] = dev_to_generic_addr(dev,
+                                                       rqd->ppa_list[i]);
+       } else {
+               rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr);
+       }
+}
+EXPORT_SYMBOL(nvm_addr_to_generic_mode);
+
+void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+       int i;
+
+       if (rqd->nr_pages > 1) {
+               for (i = 0; i < rqd->nr_pages; i++)
+                       rqd->ppa_list[i] = generic_to_dev_addr(dev,
+                                                       rqd->ppa_list[i]);
+       } else {
+               rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
+       }
+}
+EXPORT_SYMBOL(nvm_generic_to_addr_mode);
+
+int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
+                                       struct ppa_addr *ppas, int nr_ppas)
+{
+       int i, plane_cnt, pl_idx;
+
+       if (dev->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
+               rqd->nr_pages = 1;
+               rqd->ppa_addr = ppas[0];
+
+               return 0;
+       }
+
+       plane_cnt = (1 << dev->plane_mode);
+       rqd->nr_pages = plane_cnt * nr_ppas;
+
+       if (dev->ops->max_phys_sect < rqd->nr_pages)
+               return -EINVAL;
+
+       rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
+       if (!rqd->ppa_list) {
+               pr_err("nvm: failed to allocate dma memory\n");
+               return -ENOMEM;
+       }
+
+       for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
+               for (i = 0; i < nr_ppas; i++) {
+                       ppas[i].g.pl = pl_idx;
+                       rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i];
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(nvm_set_rqd_ppalist);
+
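To make the layout produced by nvm_set_rqd_ppalist() concrete, a worked
example (values illustrative): with dev->plane_mode == NVM_PLANE_DOUBLE
(plane_cnt = 2) and two caller ppas P0 and P1, the function sets
rqd->nr_pages = 4 and fills the list plane-major:

/*
 *	ppa_list[0] = P0 (g.pl = 0)	ppa_list[2] = P0 (g.pl = 1)
 *	ppa_list[1] = P1 (g.pl = 0)	ppa_list[3] = P1 (g.pl = 1)
 *
 * Note that the caller's ppas[] array is rewritten in place (g.pl is set
 * on every pass), so callers should not reuse it expecting the original
 * plane bits.
 */
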
+void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+       if (!rqd->ppa_list)
+               return;
+
+       nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list);
+}
+EXPORT_SYMBOL(nvm_free_rqd_ppalist);
+
+int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas)
+{
+       struct nvm_rq rqd;
+       int ret;
+
+       if (!dev->ops->erase_block)
+               return 0;
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas);
+       if (ret)
+               return ret;
+
+       nvm_generic_to_addr_mode(dev, &rqd);
+
+       ret = dev->ops->erase_block(dev, &rqd);
+
+       nvm_free_rqd_ppalist(dev, &rqd);
+
+       return ret;
+}
+EXPORT_SYMBOL(nvm_erase_ppa);
+
+void nvm_end_io(struct nvm_rq *rqd, int error)
+{
+       rqd->error = error;
+       rqd->end_io(rqd);
+}
+EXPORT_SYMBOL(nvm_end_io);
+
+static void nvm_end_io_sync(struct nvm_rq *rqd)
+{
+       struct completion *waiting = rqd->wait;
+
+       rqd->wait = NULL;
+
+       complete(waiting);
+}
+
+int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas,
+                               int opcode, int flags, void *buf, int len)
+{
+       DECLARE_COMPLETION_ONSTACK(wait);
+       struct nvm_rq rqd;
+       struct bio *bio;
+       int ret;
+       unsigned long hang_check;
+
+       bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL);
+       if (IS_ERR_OR_NULL(bio))
+               return -ENOMEM;
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+       ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas);
+       if (ret) {
+               bio_put(bio);
+               return ret;
+       }
+
+       rqd.opcode = opcode;
+       rqd.bio = bio;
+       rqd.wait = &wait;
+       rqd.dev = dev;
+       rqd.end_io = nvm_end_io_sync;
+       rqd.flags = flags;
+       nvm_generic_to_addr_mode(dev, &rqd);
+
+       ret = dev->ops->submit_io(dev, &rqd);
+
+       /* Prevent hang_check timer from firing at us during very long I/O */
+       hang_check = sysctl_hung_task_timeout_secs;
+       if (hang_check)
+               while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
+       else
+               wait_for_completion_io(&wait);
+
+       nvm_free_rqd_ppalist(dev, &rqd);
+
+       return rqd.error;
+}
+EXPORT_SYMBOL(nvm_submit_ppa);
+
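nvm_submit_ppa() gives callers a synchronous, completion-based wrapper
around ->submit_io(), borrowing the hang_check dance from the block
layer's blk_execute_rq(). A hedged usage sketch (NVM_OP_PREAD comes from
the lightnvm headers; passing 0 for flags is an assumption for the
example):

#include <linux/lightnvm.h>

/* Sketch: synchronously read @len bytes at @ppa into @buf. */
static int nvm_read_one_ppa(struct nvm_dev *dev, struct ppa_addr ppa,
			    void *buf, int len)
{
	return nvm_submit_ppa(dev, &ppa, 1, NVM_OP_PREAD, 0, buf, len);
}
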
+static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
+{
+       int i;
+
+       dev->lps_per_blk = dev->pgs_per_blk;
+       dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
+       if (!dev->lptbl)
+               return -ENOMEM;
+
+       /* Just a linear array */
+       for (i = 0; i < dev->lps_per_blk; i++)
+               dev->lptbl[i] = i;
+
+       return 0;
+}
+
+static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
+{
+       int i, p;
+       struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc;
+
+       if (!mlc->num_pairs)
+               return 0;
+
+       dev->lps_per_blk = mlc->num_pairs;
+       dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
+       if (!dev->lptbl)
+               return -ENOMEM;
+
+       /* The lower page table encoding consists of a list of bytes, where
+        * each byte has a lower and an upper half. The first half-byte holds
+        * the initial value and every half-byte after it is an offset added
+        * to the previous value. */
+       dev->lptbl[0] = mlc->pairs[0] & 0xF;
+       for (i = 1; i < dev->lps_per_blk; i++) {
+               p = mlc->pairs[i >> 1];
+               if (i & 0x1) /* upper */
+                       dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4);
+               else /* lower */
+                       dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF);
+       }
+
+       return 0;
+}
+
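A worked example of the half-byte decoding above (the pair bytes are
invented for illustration): with mlc->num_pairs = 4 and
mlc->pairs = { 0x21, 0x12 }, the loop produces

/*
 *	lptbl[0] =  0x21 & 0xF              = 1
 *	lptbl[1] = 1 + ((0x21 & 0xF0) >> 4) = 3	(upper half of byte 0)
 *	lptbl[2] = 3 +  (0x12 & 0xF)        = 5	(lower half of byte 1)
 *	lptbl[3] = 5 + ((0x12 & 0xF0) >> 4) = 6	(upper half of byte 1)
 *
 * i.e. the lower pages of this block are pages 1, 3, 5 and 6.
 */
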
 static int nvm_core_init(struct nvm_dev *dev)
 {
        struct nvm_id *id = &dev->identity;
@@ -206,6 +424,7 @@ static int nvm_core_init(struct nvm_dev *dev)
        dev->sec_size = grp->csecs;
        dev->oob_size = grp->sos;
        dev->sec_per_pg = grp->fpg_sz / grp->csecs;
+       dev->mccap = grp->mccap;
        memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
 
        dev->plane_mode = NVM_PLANE_SINGLE;
@@ -216,11 +435,23 @@ static int nvm_core_init(struct nvm_dev *dev)
                return -EINVAL;
        }
 
-       if (grp->fmtype != 0 && grp->fmtype != 1) {
+       switch (grp->fmtype) {
+       case NVM_ID_FMTYPE_SLC:
+               if (nvm_init_slc_tbl(dev, grp))
+                       return -ENOMEM;
+               break;
+       case NVM_ID_FMTYPE_MLC:
+               if (nvm_init_mlc_tbl(dev, grp))
+                       return -ENOMEM;
+               break;
+       default:
                pr_err("nvm: flash type not supported\n");
                return -EINVAL;
        }
 
+       if (!dev->lps_per_blk)
+               pr_info("nvm: lower page programming table missing\n");
+
        if (grp->mpos & 0x020202)
                dev->plane_mode = NVM_PLANE_DOUBLE;
        if (grp->mpos & 0x040404)
@@ -238,6 +469,7 @@ static int nvm_core_init(struct nvm_dev *dev)
                                dev->nr_chnls;
        dev->total_pages = dev->total_blocks * dev->pgs_per_blk;
        INIT_LIST_HEAD(&dev->online_targets);
+       mutex_init(&dev->mlock);
 
        return 0;
 }
@@ -249,6 +481,8 @@ static void nvm_free(struct nvm_dev *dev)
 
        if (dev->mt)
                dev->mt->unregister_mgr(dev);
+
+       kfree(dev->lptbl);
 }
 
 static int nvm_init(struct nvm_dev *dev)
@@ -338,9 +572,16 @@ int nvm_register(struct request_queue *q, char *disk_name,
                }
        }
 
+       ret = nvm_get_sysblock(dev, &dev->sb);
+       if (!ret)
+               pr_err("nvm: device not initialized.\n");
+       else if (ret < 0)
+               pr_err("nvm: err (%d) on device initialization\n", ret);
+
        /* register device with a supported media manager */
        down_write(&nvm_lock);
-       dev->mt = nvm_init_mgr(dev);
+       if (ret > 0)
+               dev->mt = nvm_init_mgr(dev);
        list_add(&dev->devices, &nvm_devices);
        up_write(&nvm_lock);
 
@@ -788,6 +1029,97 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
        return __nvm_configure_remove(&remove);
 }
 
+static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info)
+{
+       info->seqnr = 1;
+       info->erase_cnt = 0;
+       info->version = 1;
+}
+
+static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init)
+{
+       struct nvm_dev *dev;
+       struct nvm_sb_info info;
+       int ret;
+
+       down_write(&nvm_lock);
+       dev = nvm_find_nvm_dev(init->dev);
+       up_write(&nvm_lock);
+       if (!dev) {
+               pr_err("nvm: device not found\n");
+               return -EINVAL;
+       }
+
+       nvm_setup_nvm_sb_info(&info);
+
+       strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN);
+       info.fs_ppa.ppa = -1;
+
+       ret = nvm_init_sysblock(dev, &info);
+       if (ret)
+               return ret;
+
+       memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info));
+
+       down_write(&nvm_lock);
+       dev->mt = nvm_init_mgr(dev);
+       up_write(&nvm_lock);
+
+       return 0;
+}
+
+static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
+{
+       struct nvm_ioctl_dev_init init;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init)))
+               return -EFAULT;
+
+       if (init.flags != 0) {
+               pr_err("nvm: no flags supported\n");
+               return -EINVAL;
+       }
+
+       init.dev[DISK_NAME_LEN - 1] = '\0';
+
+       return __nvm_ioctl_dev_init(&init);
+}
+
+static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
+{
+       struct nvm_ioctl_dev_factory fact;
+       struct nvm_dev *dev;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory)))
+               return -EFAULT;
+
+       fact.dev[DISK_NAME_LEN - 1] = '\0';
+
+       if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1))
+               return -EINVAL;
+
+       down_write(&nvm_lock);
+       dev = nvm_find_nvm_dev(fact.dev);
+       up_write(&nvm_lock);
+       if (!dev) {
+               pr_err("nvm: device not found\n");
+               return -EINVAL;
+       }
+
+       if (dev->mt) {
+               dev->mt->unregister_mgr(dev);
+               dev->mt = NULL;
+       }
+
+       return nvm_dev_factory(dev, fact.flags);
+}
+
 static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg)
 {
        void __user *argp = (void __user *)arg;
@@ -801,6 +1133,10 @@ static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg)
                return nvm_ioctl_dev_create(file, argp);
        case NVM_DEV_REMOVE:
                return nvm_ioctl_dev_remove(file, argp);
+       case NVM_DEV_INIT:
+               return nvm_ioctl_dev_init(file, argp);
+       case NVM_DEV_FACTORY:
+               return nvm_ioctl_dev_factory(file, argp);
        }
        return 0;
 }
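
Both new commands are driven from userspace through the lightnvm control
node. A hedged sketch of NVM_DEV_INIT (struct and field names follow the
code above; the control-node path and the "gennvm" media manager name are
assumptions to check against uapi/linux/lightnvm.h and your system):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/lightnvm.h>

/* Sketch: write a fresh system block so a media manager can attach. */
static int lightnvm_dev_init(const char *devname)
{
	struct nvm_ioctl_dev_init init;
	int fd, ret;

	fd = open("/dev/lightnvm/control", O_RDWR);
	if (fd < 0)
		return -1;

	memset(&init, 0, sizeof(init));
	strncpy(init.dev, devname, sizeof(init.dev) - 1);
	strncpy(init.mmtype, "gennvm", sizeof(init.mmtype) - 1);

	ret = ioctl(fd, NVM_DEV_INIT, &init);
	close(fd);
	return ret;
}
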
index a54b339951a3e695bfa68e96fb6eaf2e3e451c62..7fb725b16148e4142d3767ce711b60450a3ff707 100644 (file)
@@ -60,7 +60,8 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn)
                lun->vlun.lun_id = i % dev->luns_per_chnl;
                lun->vlun.chnl_id = i / dev->luns_per_chnl;
                lun->vlun.nr_free_blocks = dev->blks_per_lun;
-               lun->vlun.nr_inuse_blocks = 0;
+               lun->vlun.nr_open_blocks = 0;
+               lun->vlun.nr_closed_blocks = 0;
                lun->vlun.nr_bad_blocks = 0;
        }
        return 0;
@@ -89,6 +90,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks,
 
                list_move_tail(&blk->list, &lun->bb_list);
                lun->vlun.nr_bad_blocks++;
+               lun->vlun.nr_free_blocks--;
        }
 
        return 0;
@@ -133,15 +135,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
                pba = pba - (dev->sec_per_lun * lun_id);
                blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)];
 
-               if (!blk->type) {
+               if (!blk->state) {
                        /* at this point, we don't know anything about the
                         * block. It's up to the FTL on top to re-establish the
-                        * block state
+                        * block state. The block is assumed to be open.
                         */
                        list_move_tail(&blk->list, &lun->used_list);
-                       blk->type = 1;
+                       blk->state = NVM_BLK_ST_OPEN;
                        lun->vlun.nr_free_blocks--;
-                       lun->vlun.nr_inuse_blocks++;
+                       lun->vlun.nr_open_blocks++;
                }
        }
 
@@ -255,14 +257,14 @@ static void gennvm_unregister(struct nvm_dev *dev)
        module_put(THIS_MODULE);
 }
 
-static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev,
+static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev,
                                struct nvm_lun *vlun, unsigned long flags)
 {
        struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
        struct nvm_block *blk = NULL;
        int is_gc = flags & NVM_IOTYPE_GC;
 
-       spin_lock(&vlun->lock);
+       assert_spin_locked(&vlun->lock);
 
        if (list_empty(&lun->free_list)) {
                pr_err_ratelimited("gennvm: lun %u has no free pages available",
@@ -275,83 +277,64 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev,
 
        blk = list_first_entry(&lun->free_list, struct nvm_block, list);
        list_move_tail(&blk->list, &lun->used_list);
-       blk->type = 1;
+       blk->state = NVM_BLK_ST_OPEN;
 
        lun->vlun.nr_free_blocks--;
-       lun->vlun.nr_inuse_blocks++;
+       lun->vlun.nr_open_blocks++;
 
 out:
+       return blk;
+}
+
+static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev,
+                               struct nvm_lun *vlun, unsigned long flags)
+{
+       struct nvm_block *blk;
+
+       spin_lock(&vlun->lock);
+       blk = gennvm_get_blk_unlocked(dev, vlun, flags);
        spin_unlock(&vlun->lock);
        return blk;
 }
 
-static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
+static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk)
 {
        struct nvm_lun *vlun = blk->lun;
        struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
 
-       spin_lock(&vlun->lock);
+       assert_spin_locked(&vlun->lock);
 
-       switch (blk->type) {
-       case 1:
+       if (blk->state & NVM_BLK_ST_OPEN) {
                list_move_tail(&blk->list, &lun->free_list);
+               lun->vlun.nr_open_blocks--;
                lun->vlun.nr_free_blocks++;
-               lun->vlun.nr_inuse_blocks--;
-               blk->type = 0;
-               break;
-       case 2:
+               blk->state = NVM_BLK_ST_FREE;
+       } else if (blk->state & NVM_BLK_ST_CLOSED) {
+               list_move_tail(&blk->list, &lun->free_list);
+               lun->vlun.nr_closed_blocks--;
+               lun->vlun.nr_free_blocks++;
+               blk->state = NVM_BLK_ST_FREE;
+       } else if (blk->state & NVM_BLK_ST_BAD) {
                list_move_tail(&blk->list, &lun->bb_list);
                lun->vlun.nr_bad_blocks++;
-               lun->vlun.nr_inuse_blocks--;
-               break;
-       default:
+               blk->state = NVM_BLK_ST_BAD;
+       } else {
                WARN_ON_ONCE(1);
                pr_err("gennvm: erroneous block type (%lu -> %u)\n",
-                                                       blk->id, blk->type);
+                                                       blk->id, blk->state);
                list_move_tail(&blk->list, &lun->bb_list);
                lun->vlun.nr_bad_blocks++;
-               lun->vlun.nr_inuse_blocks--;
-       }
-
-       spin_unlock(&vlun->lock);
-}
-
-static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-       int i;
-
-       if (rqd->nr_pages > 1) {
-               for (i = 0; i < rqd->nr_pages; i++)
-                       rqd->ppa_list[i] = dev_to_generic_addr(dev,
-                                                       rqd->ppa_list[i]);
-       } else {
-               rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr);
+               blk->state = NVM_BLK_ST_BAD;
        }
 }
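
The magic blk->type values (0, 1, 2) are gone; get_blk/put_blk now key off
an explicit state machine. Read from this patch (a summary, not new
behaviour):

/*
 *	NVM_BLK_ST_FREE  --get_blk-->            NVM_BLK_ST_OPEN
 *	NVM_BLK_ST_OPEN  --last page written-->  NVM_BLK_ST_CLOSED
 *	OPEN or CLOSED   --put_blk-->            NVM_BLK_ST_FREE
 *	any state        --write failure-->      NVM_BLK_ST_BAD (bb_list)
 */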
 
-static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-       int i;
-
-       if (rqd->nr_pages > 1) {
-               for (i = 0; i < rqd->nr_pages; i++)
-                       rqd->ppa_list[i] = generic_to_dev_addr(dev,
-                                                       rqd->ppa_list[i]);
-       } else {
-               rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
-       }
-}
-
-static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
 {
-       if (!dev->ops->submit_io)
-               return 0;
-
-       /* Convert address space */
-       gennvm_generic_to_addr_mode(dev, rqd);
+       struct nvm_lun *vlun = blk->lun;
 
-       rqd->dev = dev;
-       return dev->ops->submit_io(dev, rqd);
+       spin_lock(&vlun->lock);
+       gennvm_put_blk_unlocked(dev, blk);
+       spin_unlock(&vlun->lock);
 }
 
 static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa,
@@ -376,7 +359,7 @@ static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa,
        blk = &lun->vlun.blocks[ppa->g.blk];
 
        /* will be moved to bb list on put_blk from target */
-       blk->type = type;
+       blk->state = type;
 }
 
 /* mark block bad. It is expected the target recover from the error. */
@@ -390,77 +373,51 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
        if (dev->ops->set_bb_tbl(dev, rqd, 1))
                return;
 
-       gennvm_addr_to_generic_mode(dev, rqd);
+       nvm_addr_to_generic_mode(dev, rqd);
 
        /* look up blocks and mark them as bad */
        if (rqd->nr_pages > 1)
                for (i = 0; i < rqd->nr_pages; i++)
-                       gennvm_blk_set_type(dev, &rqd->ppa_list[i], 2);
+                       gennvm_blk_set_type(dev, &rqd->ppa_list[i],
+                                               NVM_BLK_ST_BAD);
        else
-               gennvm_blk_set_type(dev, &rqd->ppa_addr, 2);
+               gennvm_blk_set_type(dev, &rqd->ppa_addr, NVM_BLK_ST_BAD);
 }
 
-static int gennvm_end_io(struct nvm_rq *rqd, int error)
+static void gennvm_end_io(struct nvm_rq *rqd)
 {
        struct nvm_tgt_instance *ins = rqd->ins;
-       int ret = 0;
 
-       switch (error) {
+       switch (rqd->error) {
        case NVM_RSP_SUCCESS:
-               break;
        case NVM_RSP_ERR_EMPTYPAGE:
                break;
        case NVM_RSP_ERR_FAILWRITE:
                gennvm_mark_blk_bad(rqd->dev, rqd);
-       default:
-               ret++;
        }
 
-       ret += ins->tt->end_io(rqd, error);
-
-       return ret;
+       ins->tt->end_io(rqd);
 }
 
-static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
-                                                       unsigned long flags)
+static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 {
-       int plane_cnt = 0, pl_idx, ret;
-       struct ppa_addr addr;
-       struct nvm_rq rqd;
-
-       if (!dev->ops->erase_block)
-               return 0;
-
-       addr = block_to_ppa(dev, blk);
-
-       if (dev->plane_mode == NVM_PLANE_SINGLE) {
-               rqd.nr_pages = 1;
-               rqd.ppa_addr = addr;
-       } else {
-               plane_cnt = (1 << dev->plane_mode);
-               rqd.nr_pages = plane_cnt;
-
-               rqd.ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL,
-                                                       &rqd.dma_ppa_list);
-               if (!rqd.ppa_list) {
-                       pr_err("gennvm: failed to allocate dma memory\n");
-                       return -ENOMEM;
-               }
-
-               for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
-                       addr.g.pl = pl_idx;
-                       rqd.ppa_list[pl_idx] = addr;
-               }
-       }
+       if (!dev->ops->submit_io)
+               return -ENODEV;
 
-       gennvm_generic_to_addr_mode(dev, &rqd);
+       /* Convert address space */
+       nvm_generic_to_addr_mode(dev, rqd);
 
-       ret = dev->ops->erase_block(dev, &rqd);
+       rqd->dev = dev;
+       rqd->end_io = gennvm_end_io;
+       return dev->ops->submit_io(dev, rqd);
+}
 
-       if (plane_cnt)
-               nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list);
+static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
+                                                       unsigned long flags)
+{
+       struct ppa_addr addr = block_to_ppa(dev, blk);
 
-       return ret;
+       return nvm_erase_ppa(dev, &addr, 1);
 }
 
 static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid)
@@ -480,10 +437,11 @@ static void gennvm_lun_info_print(struct nvm_dev *dev)
        gennvm_for_each_lun(gn, lun, i) {
                spin_lock(&lun->vlun.lock);
 
-               pr_info("%s: lun%8u\t%u\t%u\t%u\n",
+               pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n",
                                dev->name, i,
                                lun->vlun.nr_free_blocks,
-                               lun->vlun.nr_inuse_blocks,
+                               lun->vlun.nr_open_blocks,
+                               lun->vlun.nr_closed_blocks,
                                lun->vlun.nr_bad_blocks);
 
                spin_unlock(&lun->vlun.lock);
@@ -491,21 +449,23 @@ static void gennvm_lun_info_print(struct nvm_dev *dev)
 }
 
 static struct nvmm_type gennvm = {
-       .name           = "gennvm",
-       .version        = {0, 1, 0},
+       .name                   = "gennvm",
+       .version                = {0, 1, 0},
+
+       .register_mgr           = gennvm_register,
+       .unregister_mgr         = gennvm_unregister,
 
-       .register_mgr   = gennvm_register,
-       .unregister_mgr = gennvm_unregister,
+       .get_blk_unlocked       = gennvm_get_blk_unlocked,
+       .put_blk_unlocked       = gennvm_put_blk_unlocked,
 
-       .get_blk        = gennvm_get_blk,
-       .put_blk        = gennvm_put_blk,
+       .get_blk                = gennvm_get_blk,
+       .put_blk                = gennvm_put_blk,
 
-       .submit_io      = gennvm_submit_io,
-       .end_io         = gennvm_end_io,
-       .erase_blk      = gennvm_erase_blk,
+       .submit_io              = gennvm_submit_io,
+       .erase_blk              = gennvm_erase_blk,
 
-       .get_lun        = gennvm_get_lun,
-       .lun_info_print = gennvm_lun_info_print,
+       .get_lun                = gennvm_get_lun,
+       .lun_info_print         = gennvm_lun_info_print,
 };
 
 static int __init gennvm_module_init(void)
index 134e4faba48259434db7d55d446f40f3deb206b9..d8c75958ced346d5aede0398a8f4b50bd96fa749 100644 (file)
@@ -179,16 +179,23 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk)
 static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
                                                        unsigned long flags)
 {
+       struct nvm_lun *lun = rlun->parent;
        struct nvm_block *blk;
        struct rrpc_block *rblk;
 
-       blk = nvm_get_blk(rrpc->dev, rlun->parent, flags);
-       if (!blk)
+       spin_lock(&lun->lock);
+       blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags);
+       if (!blk) {
+               pr_err("nvm: rrpc: cannot get new block from media manager\n");
+               spin_unlock(&lun->lock);
                return NULL;
+       }
 
        rblk = &rlun->blocks[blk->id];
-       blk->priv = rblk;
+       list_add_tail(&rblk->list, &rlun->open_list);
+       spin_unlock(&lun->lock);
 
+       blk->priv = rblk;
        bitmap_zero(rblk->invalid_pages, rrpc->dev->pgs_per_blk);
        rblk->next_page = 0;
        rblk->nr_invalid_pages = 0;
@@ -199,7 +206,13 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
 
 static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
-       nvm_put_blk(rrpc->dev, rblk->parent);
+       struct rrpc_lun *rlun = rblk->rlun;
+       struct nvm_lun *lun = rlun->parent;
+
+       spin_lock(&lun->lock);
+       nvm_put_blk_unlocked(rrpc->dev, rblk->parent);
+       list_del(&rblk->list);
+       spin_unlock(&lun->lock);
 }
 
 static void rrpc_put_blks(struct rrpc *rrpc)
@@ -287,6 +300,8 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
        }
 
        page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
+       if (!page)
+               return -ENOMEM;
 
        while ((slot = find_first_zero_bit(rblk->invalid_pages,
                                            nr_pgs_per_blk)) < nr_pgs_per_blk) {
@@ -328,6 +343,10 @@ try:
                        goto finished;
                }
                wait_for_completion_io(&wait);
+               if (bio->bi_error) {
+                       rrpc_inflight_laddr_release(rrpc, rqd);
+                       goto finished;
+               }
 
                bio_reset(bio);
                reinit_completion(&wait);
@@ -350,6 +369,8 @@ try:
                wait_for_completion_io(&wait);
 
                rrpc_inflight_laddr_release(rrpc, rqd);
+               if (bio->bi_error)
+                       goto finished;
 
                bio_reset(bio);
        }
@@ -373,16 +394,26 @@ static void rrpc_block_gc(struct work_struct *work)
        struct rrpc *rrpc = gcb->rrpc;
        struct rrpc_block *rblk = gcb->rblk;
        struct nvm_dev *dev = rrpc->dev;
+       struct nvm_lun *lun = rblk->parent->lun;
+       struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset];
 
+       mempool_free(gcb, rrpc->gcb_pool);
        pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id);
 
        if (rrpc_move_valid_pages(rrpc, rblk))
-               goto done;
+               goto put_back;
+
+       if (nvm_erase_blk(dev, rblk->parent))
+               goto put_back;
 
-       nvm_erase_blk(dev, rblk->parent);
        rrpc_put_blk(rrpc, rblk);
-done:
-       mempool_free(gcb, rrpc->gcb_pool);
+
+       return;
+
+put_back:
+       spin_lock(&rlun->lock);
+       list_add_tail(&rblk->prio, &rlun->prio_list);
+       spin_unlock(&rlun->lock);
 }
 
 /* the block with highest number of invalid pages, will be in the beginning
@@ -427,7 +458,7 @@ static void rrpc_lun_gc(struct work_struct *work)
        if (nr_blocks_need < rrpc->nr_luns)
                nr_blocks_need = rrpc->nr_luns;
 
-       spin_lock(&lun->lock);
+       spin_lock(&rlun->lock);
        while (nr_blocks_need > lun->nr_free_blocks &&
                                        !list_empty(&rlun->prio_list)) {
                struct rrpc_block *rblock = block_prio_find_max(rlun);
@@ -436,16 +467,16 @@ static void rrpc_lun_gc(struct work_struct *work)
                if (!rblock->nr_invalid_pages)
                        break;
 
+               gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
+               if (!gcb)
+                       break;
+
                list_del_init(&rblock->prio);
 
                BUG_ON(!block_is_full(rrpc, rblock));
 
                pr_debug("rrpc: selected block '%lu' for GC\n", block->id);
 
-               gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
-               if (!gcb)
-                       break;
-
                gcb->rrpc = rrpc;
                gcb->rblk = rblock;
                INIT_WORK(&gcb->ws_gc, rrpc_block_gc);
@@ -454,7 +485,7 @@ static void rrpc_lun_gc(struct work_struct *work)
 
                nr_blocks_need--;
        }
-       spin_unlock(&lun->lock);
+       spin_unlock(&rlun->lock);
 
        /* TODO: Hint that request queue can be started again */
 }
@@ -635,12 +666,24 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
                lun = rblk->parent->lun;
 
                cmnt_size = atomic_inc_return(&rblk->data_cmnt_size);
-               if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk))
+               if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) {
+                       struct nvm_block *blk = rblk->parent;
+                       struct rrpc_lun *rlun = rblk->rlun;
+
+                       spin_lock(&lun->lock);
+                       lun->nr_open_blocks--;
+                       lun->nr_closed_blocks++;
+                       blk->state &= ~NVM_BLK_ST_OPEN;
+                       blk->state |= NVM_BLK_ST_CLOSED;
+                       list_move_tail(&rblk->list, &rlun->closed_list);
+                       spin_unlock(&lun->lock);
+
                        rrpc_run_gc(rrpc, rblk);
+               }
        }
 }
 
-static int rrpc_end_io(struct nvm_rq *rqd, int error)
+static void rrpc_end_io(struct nvm_rq *rqd)
 {
        struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance);
        struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
@@ -650,11 +693,12 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error)
        if (bio_data_dir(rqd->bio) == WRITE)
                rrpc_end_io_write(rrpc, rrqd, laddr, npages);
 
+       bio_put(rqd->bio);
+
        if (rrqd->flags & NVM_IOTYPE_GC)
-               return 0;
+               return;
 
        rrpc_unlock_rq(rrpc, rqd);
-       bio_put(rqd->bio);
 
        if (npages > 1)
                nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
@@ -662,8 +706,6 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error)
                nvm_dev_dma_free(rrpc->dev, rqd->metadata, rqd->dma_metadata);
 
        mempool_free(rqd, rrpc->rq_pool);
-
-       return 0;
 }
 
 static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
@@ -841,6 +883,13 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
        err = nvm_submit_io(rrpc->dev, rqd);
        if (err) {
                pr_err("rrpc: I/O submission failed: %d\n", err);
+               bio_put(bio);
+               if (!(flags & NVM_IOTYPE_GC)) {
+                       rrpc_unlock_rq(rrpc, rqd);
+                       if (rqd->nr_pages > 1)
+                               nvm_dev_dma_free(rrpc->dev, rqd->ppa_list,
+                                                rqd->dma_ppa_list);
+               }
                return NVM_IO_ERR;
        }
 
@@ -1090,6 +1139,11 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
        struct rrpc_lun *rlun;
        int i, j;
 
+       if (dev->pgs_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
+               pr_err("rrpc: number of pages per block too high\n");
+               return -EINVAL;
+       }
+
        spin_lock_init(&rrpc->rev_lock);
 
        rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun),
@@ -1101,16 +1155,13 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
        for (i = 0; i < rrpc->nr_luns; i++) {
                struct nvm_lun *lun = dev->mt->get_lun(dev, lun_begin + i);
 
-               if (dev->pgs_per_blk >
-                               MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
-                       pr_err("rrpc: number of pages per block too high.");
-                       goto err;
-               }
-
                rlun = &rrpc->luns[i];
                rlun->rrpc = rrpc;
                rlun->parent = lun;
                INIT_LIST_HEAD(&rlun->prio_list);
+               INIT_LIST_HEAD(&rlun->open_list);
+               INIT_LIST_HEAD(&rlun->closed_list);
+
                INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
                spin_lock_init(&rlun->lock);
 
@@ -1127,6 +1178,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
                        struct nvm_block *blk = &lun->blocks[j];
 
                        rblk->parent = blk;
+                       rblk->rlun = rlun;
                        INIT_LIST_HEAD(&rblk->prio);
                        spin_lock_init(&rblk->lock);
                }
index a9696a06c38c79c91821aa9f0104dd41ce3dbf00..ef13ac7700c80e352b8db97fed8c1e38ba504cc9 100644 (file)
@@ -54,7 +54,9 @@ struct rrpc_rq {
 
 struct rrpc_block {
        struct nvm_block *parent;
+       struct rrpc_lun *rlun;
        struct list_head prio;
+       struct list_head list;
 
 #define MAX_INVALID_PAGES_STORAGE 8
        /* Bitmap for invalid page entries */
@@ -73,7 +75,16 @@ struct rrpc_lun {
        struct nvm_lun *parent;
        struct rrpc_block *cur, *gc_cur;
        struct rrpc_block *blocks;      /* Reference to block allocation */
-       struct list_head prio_list;             /* Blocks that may be GC'ed */
+
+       struct list_head prio_list;     /* Blocks that may be GC'ed */
+       struct list_head open_list;     /* In-use open blocks. These are blocks
+                                        * that can be both written to and read
+                                        * from
+                                        */
+       struct list_head closed_list;   /* In-use closed blocks. These are
+                                        * blocks that can _only_ be read from
+                                        */
+
        struct work_struct ws_gc;
 
        spinlock_t lock;
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
new file mode 100644 (file)
index 0000000..321de1f
--- /dev/null
@@ -0,0 +1,741 @@
+/*
+ * Copyright (C) 2015 Matias Bjorling. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/lightnvm.h>
+
+#define MAX_SYSBLKS 3  /* remember to update mapping scheme on change */
+#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases
+                             * enables ~1.5M updates per sysblk unit
+                             */
+
+struct sysblk_scan {
+       /* A row is a collection of flash blocks for a system block. */
+       int nr_rows;
+       int row;
+       int act_blk[MAX_SYSBLKS];
+
+       int nr_ppas;
+       struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */
+};
+
+static inline int scan_ppa_idx(int row, int blkid)
+{
+       return (row * MAX_BLKS_PR_SYSBLK) + blkid;
+}
+
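Since ppas[] in struct sysblk_scan is shared by every row, scan_ppa_idx()
flattens (row, blkid) row-major. A quick worked example with
MAX_BLKS_PR_SYSBLK == 2:

/*
 *	scan_ppa_idx(0, 0) == 0		scan_ppa_idx(0, 1) == 1
 *	scan_ppa_idx(1, 0) == 2		scan_ppa_idx(2, 1) == 5
 */
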
+void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb)
+{
+       info->seqnr = be32_to_cpu(sb->seqnr);
+       info->erase_cnt = be32_to_cpu(sb->erase_cnt);
+       info->version = be16_to_cpu(sb->version);
+       strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN);
+       info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa);
+}
+
+void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info)
+{
+       sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC);
+       sb->seqnr = cpu_to_be32(info->seqnr);
+       sb->erase_cnt = cpu_to_be32(info->erase_cnt);
+       sb->version = cpu_to_be16(info->version);
+       strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN);
+       sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa);
+}
+
+static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
+{
+       int nr_rows = min_t(int, MAX_SYSBLKS, dev->nr_chnls);
+       int i;
+
+       for (i = 0; i < nr_rows; i++)
+               sysblk_ppas[i].ppa = 0;
+
+       /* If possible, place a sysblk at the first, middle and last channel
+        * of the device; otherwise create only one or two sysblks.
+        */
+       switch (dev->nr_chnls) {
+       case 2:
+               sysblk_ppas[1].g.ch = 1;
+               /* fall-through */
+       case 1:
+               sysblk_ppas[0].g.ch = 0;
+               break;
+       default:
+               sysblk_ppas[0].g.ch = 0;
+               sysblk_ppas[1].g.ch = dev->nr_chnls / 2;
+               sysblk_ppas[2].g.ch = dev->nr_chnls - 1;
+               break;
+       }
+
+       return nr_rows;
+}
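+
+/*
+ * For example, an 8-channel device gets sysblk rows on channels 0, 4 and
+ * 7; a single-channel device gets one row on channel 0, and a two-channel
+ * device gets one row on each of its channels.
+ */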
+
+void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
+                                               struct ppa_addr *sysblk_ppas)
+{
+       memset(s, 0, sizeof(struct sysblk_scan));
+       s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas);
+}
+
+static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
+                                                               void *private)
+{
+       struct sysblk_scan *s = private;
+       int i, nr_sysblk = 0;
+
+       for (i = 0; i < nr_blks; i++) {
+               if (blks[i] != NVM_BLK_T_HOST)
+                       continue;
+
+               if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) {
+                       pr_err("nvm: too many host blks\n");
+                       return -EINVAL;
+               }
+
+               ppa.g.blk = i;
+
+               s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa;
+               s->nr_ppas++;
+               nr_sysblk++;
+       }
+
+       return 0;
+}
+
+static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
+                               struct ppa_addr *ppas, nvm_bb_update_fn *fn)
+{
+       struct ppa_addr dppa;
+       int i, ret;
+
+       s->nr_ppas = 0;
+
+       for (i = 0; i < s->nr_rows; i++) {
+               dppa = generic_to_dev_addr(dev, ppas[i]);
+               s->row = i;
+
+               ret = dev->ops->get_bb_tbl(dev, dppa, dev->blks_per_lun, fn, s);
+               if (ret) {
+                       pr_err("nvm: failed bb tbl for ppa (%u %u)\n",
+                                                       ppas[i].g.ch,
+                                                       ppas[i].g.blk);
+                       return ret;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Scans a block for the latest sysblk.
+ * Returns:
+ *     0 - no newer sysblk found. PPA is updated to the latest page.
+ *     1 - newer sysblk found and stored in *sblk. PPA is updated to the
+ *         next valid page.
+ *     <0 - error.
+ */
+static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
+                                               struct nvm_system_block *sblk)
+{
+       struct nvm_system_block *cur;
+       int pg, cursz, ret, found = 0;
+
+       /* A buffer for a full flash page is allocated; only the start of it
+        * contains the system block information.
+        */
+       cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes;
+       cur = kmalloc(cursz, GFP_KERNEL);
+       if (!cur)
+               return -ENOMEM;
+
+       /* perform linear scan through the block */
+       for (pg = 0; pg < dev->lps_per_blk; pg++) {
+               ppa->g.pg = ppa_to_slc(dev, pg);
+
+               ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE,
+                                                               cur, cursz);
+               if (ret) {
+                       if (ret == NVM_RSP_ERR_EMPTYPAGE) {
+                               pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n",
+                                                       ppa->g.ch,
+                                                       ppa->g.lun,
+                                                       ppa->g.blk,
+                                                       ppa->g.pg);
+                               break;
+                       }
+                       pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)",
+                                                       ret,
+                                                       ppa->g.ch,
+                                                       ppa->g.lun,
+                                                       ppa->g.blk,
+                                                       ppa->g.pg);
+                       break; /* if we can't read a page, stop scanning this
+                               * blk and let the caller move on to the next
+                               */
+               }
+
+               if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) {
+                       pr_debug("nvm: scan break for ppa (%u %u %u %u)\n",
+                                                       ppa->g.ch,
+                                                       ppa->g.lun,
+                                                       ppa->g.blk,
+                                                       ppa->g.pg);
+                       break; /* last valid page already found */
+               }
+
+               if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr))
+                       continue;
+
+               memcpy(sblk, cur, sizeof(struct nvm_system_block));
+               found = 1;
+       }
+
+       kfree(cur);
+
+       return found;
+}
+
+static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
+{
+       struct nvm_rq rqd;
+       int ret;
+
+       if (s->nr_ppas > dev->ops->max_phys_sect) {
+               pr_err("nvm: unable to update all sysblocks atomically\n");
+               return -EINVAL;
+       }
+
+       memset(&rqd, 0, sizeof(struct nvm_rq));
+
+       nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas);
+       nvm_generic_to_addr_mode(dev, &rqd);
+
+       ret = dev->ops->set_bb_tbl(dev, &rqd, type);
+       nvm_free_rqd_ppalist(dev, &rqd);
+       if (ret) {
+               pr_err("nvm: sysblk failed bb mark\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
+                                                               void *private)
+{
+       struct sysblk_scan *s = private;
+       struct ppa_addr *sppa;
+       int i, blkid = 0;
+
+       for (i = 0; i < nr_blks; i++) {
+               if (blks[i] == NVM_BLK_T_HOST)
+                       return -EEXIST;
+
+               if (blks[i] != NVM_BLK_T_FREE)
+                       continue;
+
+               sppa = &s->ppas[scan_ppa_idx(s->row, blkid)];
+               sppa->g.ch = ppa.g.ch;
+               sppa->g.lun = ppa.g.lun;
+               sppa->g.blk = i;
+               s->nr_ppas++;
+               blkid++;
+
+               pr_debug("nvm: use (%u %u %u) as sysblk\n",
+                                       sppa->g.ch, sppa->g.lun, sppa->g.blk);
+               if (blkid > MAX_BLKS_PR_SYSBLK - 1)
+                       return 0;
+       }
+
+       pr_err("nvm: sysblk failed get sysblk\n");
+       return -EINVAL;
+}
+
+static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
+                                                       struct sysblk_scan *s)
+{
+       struct nvm_system_block nvmsb;
+       void *buf;
+       int i, sect, ret, bufsz;
+       struct ppa_addr *ppas;
+
+       nvm_cpu_to_sysblk(&nvmsb, info);
+
+       /* buffer for flash page */
+       bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes;
+       buf = kzalloc(bufsz, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+       memcpy(buf, &nvmsb, sizeof(struct nvm_system_block));
+
+       ppas = kcalloc(dev->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL);
+       if (!ppas) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       /* Write and verify */
+       for (i = 0; i < s->nr_rows; i++) {
+               ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])];
+
+               pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n",
+                                                       ppas[0].g.ch,
+                                                       ppas[0].g.lun,
+                                                       ppas[0].g.blk,
+                                                       ppas[0].g.pg);
+
+               /* Expand to all sectors within a flash page */
+               if (dev->sec_per_pg > 1) {
+                       for (sect = 1; sect < dev->sec_per_pg; sect++) {
+                               ppas[sect].ppa = ppas[0].ppa;
+                               ppas[sect].g.sec = sect;
+                       }
+               }
+
+               ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE,
+                                               NVM_IO_SLC_MODE, buf, bufsz);
+               if (ret) {
+                       pr_err("nvm: sysblk failed program (%u %u %u)\n",
+                                                       ppas[0].g.ch,
+                                                       ppas[0].g.lun,
+                                                       ppas[0].g.blk);
+                       break;
+               }
+
+               ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD,
+                                               NVM_IO_SLC_MODE, buf, bufsz);
+               if (ret) {
+                       pr_err("nvm: sysblk failed read (%u %u %u)\n",
+                                                       ppas[0].g.ch,
+                                                       ppas[0].g.lun,
+                                                       ppas[0].g.blk);
+                       break;
+               }
+
+               if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) {
+                       pr_err("nvm: sysblk failed verify (%u %u %u)\n",
+                                                       ppas[0].g.ch,
+                                                       ppas[0].g.lun,
+                                                       ppas[0].g.blk);
+                       ret = -EINVAL;
+                       break;
+               }
+       }
+
+       kfree(ppas);
+err:
+       kfree(buf);
+
+       return ret;
+}
+
+static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s)
+{
+       int i, ret;
+       unsigned long nxt_blk;
+       struct ppa_addr *ppa;
+
+       for (i = 0; i < s->nr_rows; i++) {
+               nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK;
+               ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)];
+               ppa->g.pg = ppa_to_slc(dev, 0);
+
+               ret = nvm_erase_ppa(dev, ppa, 1);
+               if (ret)
+                       return ret;
+
+               s->act_blk[i] = nxt_blk;
+       }
+
+       return 0;
+}
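+
+/*
+ * For illustration: with MAX_BLKS_PR_SYSBLK = 2 the active block in each
+ * row simply alternates 0 -> 1 -> 0 -> ..., and the incoming block is
+ * erased before it takes over.
+ */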
+
+int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
+{
+       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
+       struct sysblk_scan s;
+       struct nvm_system_block *cur;
+       int i, j, found = 0;
+       int ret = -ENOMEM;
+
+       /*
+        * 1. setup sysblk locations
+        * 2. get bad block list
+        * 3. filter on host-specific (type 3)
+        * 4. iterate through all and find the highest seq nr.
+        * 5. return superblock information
+        */
+
+       if (!dev->ops->get_bb_tbl)
+               return -EINVAL;
+
+       nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
+
+       mutex_lock(&dev->mlock);
+       ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks);
+       if (ret)
+               goto err_sysblk;
+
+       /* no sysblocks initialized */
+       if (!s.nr_ppas)
+               goto err_sysblk;
+
+       cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
+       if (!cur)
+               goto err_sysblk;
+
+       /* find the latest block across all sysblocks */
+       for (i = 0; i < s.nr_rows; i++) {
+               for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
+                       struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)];
+
+                       ret = nvm_scan_block(dev, &ppa, cur);
+                       if (ret > 0)
+                               found = 1;
+                       else if (ret < 0)
+                               break;
+               }
+       }
+
+       nvm_sysblk_to_cpu(info, cur);
+
+       kfree(cur);
+err_sysblk:
+       mutex_unlock(&dev->mlock);
+
+       if (found)
+               return 1;
+       return ret;
+}
+
+int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new)
+{
+       /* 1. for each latest superblock
+        * 2. if room
+        *    a. write a new flash page entry with the updated information
+        * 3. if no room
+        *    a. find the next available block on the lun (linear search);
+        *       if none, continue to the next lun; if none at all, report
+        *       an error, and also that it wasn't possible to write to all
+        *       superblocks.
+        *    b. write the data to that block.
+        */
+       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
+       struct sysblk_scan s;
+       struct nvm_system_block *cur;
+       int i, j, ppaidx, found = 0;
+       int ret = -ENOMEM;
+
+       if (!dev->ops->get_bb_tbl)
+               return -EINVAL;
+
+       nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
+
+       mutex_lock(&dev->mlock);
+       ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks);
+       if (ret)
+               goto err_sysblk;
+
+       cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
+       if (!cur)
+               goto err_sysblk;
+
+       /* Get the latest sysblk for each sysblk row */
+       for (i = 0; i < s.nr_rows; i++) {
+               found = 0;
+               for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
+                       ppaidx = scan_ppa_idx(i, j);
+                       ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur);
+                       if (ret > 0) {
+                               s.act_blk[i] = j;
+                               found = 1;
+                       } else if (ret < 0)
+                               break;
+               }
+       }
+
+       if (!found) {
+               pr_err("nvm: no valid sysblks found to update\n");
+               ret = -EINVAL;
+               goto err_cur;
+       }
+
+       /*
+        * All sysblocks found. Check that they have the same page id in
+        * their flash blocks.
+        */
+       for (i = 1; i < s.nr_rows; i++) {
+               struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])];
+               struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])];
+
+               if (l.g.pg != r.g.pg) {
+                       pr_err("nvm: sysblks not on same page. Previous update failed.\n");
+                       ret = -EINVAL;
+                       goto err_cur;
+               }
+       }
+
+       /*
+        * Check that there hasn't been another update to the seqnr since
+        * we began.
+        */
+       if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) {
+               pr_err("nvm: seq is not sequential\n");
+               ret = -EINVAL;
+               goto err_cur;
+       }
+
+       /*
+        * When all pages in a block have been written, a new block is
+        * selected and writing continues on the new block.
+        */
+       if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg ==
+                                               dev->lps_per_blk - 1) {
+               ret = nvm_prepare_new_sysblks(dev, &s);
+               if (ret)
+                       goto err_cur;
+       }
+
+       ret = nvm_write_and_verify(dev, new, &s);
+err_cur:
+       kfree(cur);
+err_sysblk:
+       mutex_unlock(&dev->mlock);
+
+       return ret;
+}
+
+int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
+{
+       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
+       struct sysblk_scan s;
+       int ret;
+
+       /*
+        * 1. select master blocks and the first available blks
+        * 2. get the bad block list
+        * 3. mark MAX_SYSBLKS blocks as host-allocated
+        * 4. write and verify the data to the blocks
+        */
+
+       if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl)
+               return -EINVAL;
+
+       if (!(dev->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) {
+               pr_err("nvm: memory does not support SLC access\n");
+               return -EINVAL;
+       }
+
+       /* Index all sysblocks and mark them as host-driven */
+       nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
+
+       mutex_lock(&dev->mlock);
+       ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_free_blks);
+       if (ret)
+               goto err_mark;
+
+       ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_HOST);
+       if (ret)
+               goto err_mark;
+
+       /* Write to the first block of each row */
+       ret = nvm_write_and_verify(dev, info, &s);
+err_mark:
+       mutex_unlock(&dev->mlock);
+       return ret;
+}
+
+struct factory_blks {
+       struct nvm_dev *dev;
+       int flags;
+       unsigned long *blks;
+};
+
+static int factory_nblks(int nblks)
+{
+       /* Round up to nearest BITS_PER_LONG */
+       return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+}
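+
+/*
+ * For example, with BITS_PER_LONG == 64 and 1020 blocks per lun,
+ * factory_nblks() rounds up to 1024 bits, i.e. exactly 16 longs of
+ * bitmap per lun.
+ */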
+
+static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun)
+{
+       int nblks = factory_nblks(dev->blks_per_lun);
+
+       return ((ch * dev->luns_per_chnl * nblks) + (lun * nblks)) /
+                                                               BITS_PER_LONG;
+}
+
+static int nvm_factory_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
+                                                               void *private)
+{
+       struct factory_blks *f = private;
+       struct nvm_dev *dev = f->dev;
+       int i, lunoff;
+
+       lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun);
+
+       /* cleared bits mark blocks that must be erased */
+       for (i = 0; i < nr_blks; i++) {
+               switch (blks[i]) {
+               case NVM_BLK_T_FREE:
+                       if (f->flags & NVM_FACTORY_ERASE_ONLY_USER)
+                               set_bit(i, &f->blks[lunoff]);
+                       break;
+               case NVM_BLK_T_HOST:
+                       if (!(f->flags & NVM_FACTORY_RESET_HOST_BLKS))
+                               set_bit(i, &f->blks[lunoff]);
+                       break;
+               case NVM_BLK_T_GRWN_BAD:
+                       if (!(f->flags & NVM_FACTORY_RESET_GRWN_BBLKS))
+                               set_bit(i, &f->blks[lunoff]);
+                       break;
+               default:
+                       set_bit(i, &f->blks[lunoff]);
+                       break;
+               }
+       }
+
+       return 0;
+}
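+
+/*
+ * Summarising the policy above: a block's bit is left clear (i.e. the
+ * block gets erased) only if it is free (unless NVM_FACTORY_ERASE_ONLY_USER
+ * is set), host-owned with NVM_FACTORY_RESET_HOST_BLKS set, or grown-bad
+ * with NVM_FACTORY_RESET_GRWN_BBLKS set; everything else is preserved.
+ */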
+
+static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list,
+                                       int max_ppas, struct factory_blks *f)
+{
+       struct ppa_addr ppa;
+       int ch, lun, blkid, idx, done = 0, ppa_cnt = 0;
+       unsigned long *offset;
+
+       while (!done) {
+               done = 1;
+               for (ch = 0; ch < dev->nr_chnls; ch++) {
+                       for (lun = 0; lun < dev->luns_per_chnl; lun++) {
+                               idx = factory_blk_offset(dev, ch, lun);
+                               offset = &f->blks[idx];
+
+                               blkid = find_first_zero_bit(offset,
+                                                       dev->blks_per_lun);
+                               if (blkid >= dev->blks_per_lun)
+                                       continue;
+                               set_bit(blkid, offset);
+
+                               ppa.ppa = 0;
+                               ppa.g.ch = ch;
+                               ppa.g.lun = lun;
+                               ppa.g.blk = blkid;
+                               pr_debug("nvm: erase ppa (%u %u %u)\n",
+                                                               ppa.g.ch,
+                                                               ppa.g.lun,
+                                                               ppa.g.blk);
+
+                               erase_list[ppa_cnt] = ppa;
+                               ppa_cnt++;
+                               done = 0;
+
+                               if (ppa_cnt == max_ppas)
+                                       return ppa_cnt;
+                       }
+               }
+       }
+
+       return ppa_cnt;
+}
+
+static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa,
+                                       nvm_bb_update_fn *fn, void *priv)
+{
+       struct ppa_addr dev_ppa;
+       int ret;
+
+       dev_ppa = generic_to_dev_addr(dev, ppa);
+
+       ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun, fn, priv);
+       if (ret)
+               pr_err("nvm: failed bb tbl for ch%u lun%u\n",
+                                                       ppa.g.ch, ppa.g.blk);
+       return ret;
+}
+
+static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f)
+{
+       int ch, lun, ret;
+       struct ppa_addr ppa;
+
+       ppa.ppa = 0;
+       for (ch = 0; ch < dev->nr_chnls; ch++) {
+               for (lun = 0; lun < dev->luns_per_chnl; lun++) {
+                       ppa.g.ch = ch;
+                       ppa.g.lun = lun;
+
+                       ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks,
+                                                                       f);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       return 0;
+}
+
+int nvm_dev_factory(struct nvm_dev *dev, int flags)
+{
+       struct factory_blks f;
+       struct ppa_addr *ppas;
+       int ppa_cnt, ret = -ENOMEM;
+       int max_ppas = dev->ops->max_phys_sect / dev->nr_planes;
+       struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
+       struct sysblk_scan s;
+
+       f.blks = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns,
+                                                               GFP_KERNEL);
+       if (!f.blks)
+               return ret;
+
+       ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL);
+       if (!ppas)
+               goto err_blks;
+
+       f.dev = dev;
+       f.flags = flags;
+
+       /* create list of blks to be erased */
+       ret = nvm_fact_select_blks(dev, &f);
+       if (ret)
+               goto err_ppas;
+
+       /* continue erasing until the list of blks is empty */
+       while ((ppa_cnt = nvm_fact_get_blks(dev, ppas, max_ppas, &f)) > 0)
+               nvm_erase_ppa(dev, ppas, ppa_cnt);
+
+       /* mark host reserved blocks free */
+       if (flags & NVM_FACTORY_RESET_HOST_BLKS) {
+               nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
+               mutex_lock(&dev->mlock);
+               ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas,
+                                                       sysblk_get_host_blks);
+               if (!ret)
+                       ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE);
+               mutex_unlock(&dev->mlock);
+       }
+err_ppas:
+       kfree(ppas);
+err_blks:
+       kfree(f.blks);
+       return ret;
+}
+EXPORT_SYMBOL(nvm_dev_factory);
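
A minimal usage sketch of the sysblk API added above, assuming a probed
struct nvm_dev *dev, SLC-capable media, and a media manager name used
purely for illustration; error handling is trimmed:

        struct nvm_sb_info info = { .seqnr = 0, .version = 1 };

        strncpy(info.mmtype, "gennvm", NVM_MMTYPE_LEN);

        if (nvm_init_sysblock(dev, &info))      /* mark blks + first write */
                return -EIO;

        if (nvm_get_sysblock(dev, &info) != 1)  /* 1 == latest copy found */
                return -EIO;

        info.seqnr++;                           /* seqnr must stay sequential */
        if (nvm_update_sysblock(dev, &info))
                return -EIO;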
index 83392f856dfd5d86e67cc636ab61e0fb0f16750d..22b9e34ceb75c5318a25870bf346a27ba2e64052 100644 (file)
@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c)
        do {
                ret = btree_root(gc_root, c, &op, &writes, &stats);
                closure_sync(&writes);
+               cond_resched();
 
                if (ret && ret != -EAGAIN)
                        pr_warn("gc failed!");
@@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
                rw_lock(true, b, b->level);
 
                if (b->key.ptr[0] != btree_ptr ||
-                   b->seq != seq + 1)
+                   b->seq != seq + 1) {
+                       op->lock = b->level;
                        goto out;
+               }
        }
 
        SET_KEY_PTRS(check_key, 1);
index 679a093a3bf6382a3edc89dc5be96e9019b938ea..8d0ead98eb6edc2c25eafad5f8b4ab0a49e91e2d 100644 (file)
@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
        WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
             sysfs_create_link(&c->kobj, &d->kobj, d->name),
             "Couldn't create device <-> cache set symlinks");
+
+       clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
 }
 
 static void bcache_device_detach(struct bcache_device *d)
@@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
        buf[SB_LABEL_SIZE] = '\0';
        env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
 
-       if (atomic_xchg(&dc->running, 1))
+       if (atomic_xchg(&dc->running, 1)) {
+               kfree(env[1]);
+               kfree(env[2]);
                return;
+       }
 
        if (!d->c &&
            BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
@@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
                        else
                                err = "device busy";
                        mutex_unlock(&bch_register_lock);
+                       if (attr == &ksysfs_register_quiet)
+                               goto out;
                }
                goto err;
        }
@@ -1971,8 +1978,7 @@ out:
 err_close:
        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 err:
-       if (attr != &ksysfs_register_quiet)
-               pr_info("error opening %s: %s", path, err);
+       pr_info("error opening %s: %s", path, err);
        ret = -EINVAL;
        goto out;
 }
@@ -2066,8 +2072,10 @@ static int __init bcache_init(void)
        closure_debug_init();
 
        bcache_major = register_blkdev(0, "bcache");
-       if (bcache_major < 0)
+       if (bcache_major < 0) {
+               unregister_reboot_notifier(&reboot);
                return bcache_major;
+       }
 
        if (!(bcache_wq = create_workqueue("bcache")) ||
            !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
index b23f88d9f18cf1caa863b1b4ed56c37901296ece..b9346cd9cda192bdf9142bfa462dc0ab74383707 100644 (file)
@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 
 static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 {
+       struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
+
+       BUG_ON(KEY_INODE(k) != dc->disk.id);
+
        return KEY_DIRTY(k);
 }
 
@@ -372,11 +376,24 @@ next:
        }
 }
 
+/*
+ * Returns true if we scanned the entire disk
+ */
 static bool refill_dirty(struct cached_dev *dc)
 {
        struct keybuf *buf = &dc->writeback_keys;
+       struct bkey start = KEY(dc->disk.id, 0, 0);
        struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
-       bool searched_from_start = false;
+       struct bkey start_pos;
+
+       /*
+        * make sure keybuf pos is inside the range for this disk - at
+        * bringup we might not be attached yet, so this disk's inode nr
+        * isn't initialized yet
+        */
+       if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
+           bkey_cmp(&buf->last_scanned, &end) > 0)
+               buf->last_scanned = start;
 
        if (dc->partial_stripes_expensive) {
                refill_full_stripes(dc);
@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc)
                        return false;
        }
 
-       if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
-               buf->last_scanned = KEY(dc->disk.id, 0, 0);
-               searched_from_start = true;
-       }
-
+       start_pos = buf->last_scanned;
        bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
 
-       return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
+       if (bkey_cmp(&buf->last_scanned, &end) < 0)
+               return false;
+
+       /*
+        * If we get to the end, start scanning again from the beginning,
+        * and only scan up to where we initially started scanning from:
+        */
+       buf->last_scanned = start;
+       bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
+
+       return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
 }
 
 static int bch_writeback_thread(void *arg)
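
The reworked refill_dirty() above is a wrap-around scan; a minimal sketch
of the same pattern, assuming a generic scan(cursor, limit) primitive that
advances the cursor until it reaches the limit or the key buffer fills:

        start_pos = cursor;             /* remember where this pass began */
        scan(&cursor, end);
        if (cursor < end)
                return false;           /* buffer filled before the end */
        cursor = start;                 /* wrapped: rescan from the start */
        scan(&cursor, start_pos);       /* stop where this pass began */
        return cursor >= start_pos;     /* true: entire disk visited once */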
index 0a9dab187b79c7ef0a4429c4616a6985d320b964..073a042aed243b2660f6b70a380d0647c709aa65 100644 (file)
@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
 {
-       wake_up_process(dc->writeback_thread);
+       if (!IS_ERR_OR_NULL(dc->writeback_thread))
+               wake_up_process(dc->writeback_thread);
 }
 
 static inline void bch_writeback_add(struct cached_dev *dc)
index 154aced0b91b63e33cb95a9ba1a82e386395392d..65cc0ac9b82d2b9e1eb928131c7a1dea074f2d02 100644 (file)
@@ -170,7 +170,7 @@ static int mmc_ios_show(struct seq_file *s, void *data)
                str = "invalid";
                break;
        }
-       seq_printf(s, "signal voltage:\t%u (%s)\n", ios->chip_select, str);
+       seq_printf(s, "signal voltage:\t%u (%s)\n", ios->signal_voltage, str);
 
        switch (ios->drv_type) {
        case MMC_SET_DRIVER_TYPE_A:
index 2b16263458af001723636f7770bbf4906b16b07c..aba786daebcadd9e193ffabe71e3a0124dd8ec62 100644 (file)
@@ -29,15 +29,18 @@ struct mmc_pwrseq_simple {
 static void mmc_pwrseq_simple_set_gpios_value(struct mmc_pwrseq_simple *pwrseq,
                                              int value)
 {
-       int i;
        struct gpio_descs *reset_gpios = pwrseq->reset_gpios;
-       int values[reset_gpios->ndescs];
 
-       for (i = 0; i < reset_gpios->ndescs; i++)
-               values[i] = value;
+       if (!IS_ERR(reset_gpios)) {
+               int i;
+               int values[reset_gpios->ndescs];
 
-       gpiod_set_array_value_cansleep(reset_gpios->ndescs, reset_gpios->desc,
-                                      values);
+               for (i = 0; i < reset_gpios->ndescs; i++)
+                       values[i] = value;
+
+               gpiod_set_array_value_cansleep(
+                       reset_gpios->ndescs, reset_gpios->desc, values);
+       }
 }
 
 static void mmc_pwrseq_simple_pre_power_on(struct mmc_host *host)
@@ -79,7 +82,8 @@ static void mmc_pwrseq_simple_free(struct mmc_host *host)
        struct mmc_pwrseq_simple *pwrseq = container_of(host->pwrseq,
                                        struct mmc_pwrseq_simple, pwrseq);
 
-       gpiod_put_array(pwrseq->reset_gpios);
+       if (!IS_ERR(pwrseq->reset_gpios))
+               gpiod_put_array(pwrseq->reset_gpios);
 
        if (!IS_ERR(pwrseq->ext_clk))
                clk_put(pwrseq->ext_clk);
@@ -112,7 +116,9 @@ struct mmc_pwrseq *mmc_pwrseq_simple_alloc(struct mmc_host *host,
        }
 
        pwrseq->reset_gpios = gpiod_get_array(dev, "reset", GPIOD_OUT_HIGH);
-       if (IS_ERR(pwrseq->reset_gpios)) {
+       if (IS_ERR(pwrseq->reset_gpios) &&
+           PTR_ERR(pwrseq->reset_gpios) != -ENOENT &&
+           PTR_ERR(pwrseq->reset_gpios) != -ENOSYS) {
                ret = PTR_ERR(pwrseq->reset_gpios);
                goto clk_put;
        }
index f2b164b214ae289dd313a0945cbbd5dd7e1dc532..bb39a29b2db68d72c5e34d7e408cce0105c110a7 100644 (file)
@@ -329,6 +329,7 @@ static int mmc_read_switch(struct mmc_card *card)
                card->sw_caps.sd3_bus_mode = status[13];
                /* Driver Strengths supported by the card */
                card->sw_caps.sd3_drv_type = status[9];
+               card->sw_caps.sd3_curr_limit = status[7] | status[6] << 8;
        }
 
 out:
@@ -545,14 +546,25 @@ static int sd_set_current_limit(struct mmc_card *card, u8 *status)
         * when we set current limit to 200ma, the card will draw 200ma, and
         * when we set current limit to 400/600/800ma, the card will draw its
         * maximum 300ma from the host.
+        *
+        * The above is incorrect: if we try to set a current limit that is
+        * not supported by the card, the card can rightfully error out the
+        * attempt, and remain at the default current limit.  This results
+        * in a 300mA card being limited to 200mA even though the host
+        * supports 800mA. Failures seen with SanDisk 8GB UHS cards with
+        * an iMX6 host. --rmk
         */
-       if (max_current >= 800)
+       if (max_current >= 800 &&
+           card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_800)
                current_limit = SD_SET_CURRENT_LIMIT_800;
-       else if (max_current >= 600)
+       else if (max_current >= 600 &&
+                card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_600)
                current_limit = SD_SET_CURRENT_LIMIT_600;
-       else if (max_current >= 400)
+       else if (max_current >= 400 &&
+                card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_400)
                current_limit = SD_SET_CURRENT_LIMIT_400;
-       else if (max_current >= 200)
+       else if (max_current >= 200 &&
+                card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_200)
                current_limit = SD_SET_CURRENT_LIMIT_200;
 
        if (current_limit != SD_SET_CURRENT_NO_CHANGE) {
@@ -626,9 +638,9 @@ static int mmc_sd_init_uhs_card(struct mmc_card *card)
         * SDR104 mode SD-cards. Note that tuning is mandatory for SDR104.
         */
        if (!mmc_host_is_spi(card->host) &&
-               (card->sd_bus_speed == UHS_SDR50_BUS_SPEED ||
-                card->sd_bus_speed == UHS_DDR50_BUS_SPEED ||
-                card->sd_bus_speed == UHS_SDR104_BUS_SPEED)) {
+               (card->host->ios.timing == MMC_TIMING_UHS_SDR50 ||
+                card->host->ios.timing == MMC_TIMING_UHS_DDR50 ||
+                card->host->ios.timing == MMC_TIMING_UHS_SDR104)) {
                err = mmc_execute_tuning(card);
 
                /*
@@ -638,7 +650,7 @@ static int mmc_sd_init_uhs_card(struct mmc_card *card)
                 * difference between v3.00 and 3.01 spec means that CMD19
                 * tuning is also available for DDR50 mode.
                 */
-               if (err && card->sd_bus_speed == UHS_DDR50_BUS_SPEED) {
+               if (err && card->host->ios.timing == MMC_TIMING_UHS_DDR50) {
                        pr_warn("%s: ddr50 tuning failed\n",
                                mmc_hostname(card->host));
                        err = 0;
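
A worked example of the corrected current-limit selection above: with a
host good for 800mA but a card whose sd3_curr_limit advertises only
SD_MAX_CURRENT_200 and SD_MAX_CURRENT_400, the cascade now falls through
to SD_SET_CURRENT_LIMIT_400 instead of requesting an 800mA switch that
the card would reject while silently staying at its default limit.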
index d61ba1a0495ebe828ba8c110da3d9053b506079b..467b3cf80c44549c704080d78540c46599eb6882 100644 (file)
@@ -535,8 +535,8 @@ static int mmc_sdio_init_uhs_card(struct mmc_card *card)
         * SDR104 mode SD-cards. Note that tuning is mandatory for SDR104.
         */
        if (!mmc_host_is_spi(card->host) &&
-           ((card->sw_caps.sd3_bus_mode & SD_MODE_UHS_SDR50) ||
-            (card->sw_caps.sd3_bus_mode & SD_MODE_UHS_SDR104)))
+           ((card->host->ios.timing == MMC_TIMING_UHS_SDR50) ||
+             (card->host->ios.timing == MMC_TIMING_UHS_SDR104)))
                err = mmc_execute_tuning(card);
 out:
        return err;
index 8e94e555b788d4bc56364c488f1af0894e4db50a..6f6fc527a263384817316afe478803634429633a 100644 (file)
@@ -223,6 +223,7 @@ static const struct cis_tpl cis_tpl_list[] = {
        {       0x20,   4,      cistpl_manfid           },
        {       0x21,   2,      /* cistpl_funcid */     },
        {       0x22,   0,      cistpl_funce            },
+       {       0x91,   2,      /* cistpl_sdio_std */   },
 };
 
 static int sdio_read_cis(struct mmc_card *card, struct sdio_func *func)
index fb266745f8240d603956b8b791370ac4764f5a23..0d6ca4116f3dca7fe3170e53809e5e300193cca0 100644 (file)
@@ -151,6 +151,7 @@ static struct variant_data variant_nomadik = {
        .fifosize               = 16 * 4,
        .fifohalfsize           = 8 * 4,
        .clkreg                 = MCI_CLK_ENABLE,
+       .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS,
        .datalength_bits        = 24,
        .datactrl_mask_sdio     = MCI_ST_DPSM_SDIOEN,
        .st_sdio                = true,
@@ -1886,7 +1887,7 @@ static struct amba_id mmci_ids[] = {
        {
                .id     = 0x00280180,
                .mask   = 0x00ffffff,
-               .data   = &variant_u300,
+               .data   = &variant_nomadik,
        },
        {
                .id     = 0x00480180,
index e4b05dbb9ca822f003f566d07a60c508721bc68c..4a0d6b80eaa374e7749081c7f966ce85379ad3e2 100644 (file)
@@ -94,9 +94,9 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host)
                        desc = NULL;
                        ret = cookie;
                }
+               dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n",
+                       __func__, host->sg_len, ret, cookie, host->mrq);
        }
-       dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n",
-               __func__, host->sg_len, ret, cookie, host->mrq);
 
 pio:
        if (!desc) {
@@ -116,8 +116,8 @@ pio:
                         "DMA failed: %d, falling back to PIO\n", ret);
        }
 
-       dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d, sg[%d]\n", __func__,
-               desc, cookie, host->sg_len);
+       dev_dbg(&host->pdev->dev, "%s(): desc %p, sg[%d]\n", __func__,
+               desc, host->sg_len);
 }
 
 static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
@@ -174,9 +174,9 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
                        desc = NULL;
                        ret = cookie;
                }
+               dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n",
+                       __func__, host->sg_len, ret, cookie, host->mrq);
        }
-       dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n",
-               __func__, host->sg_len, ret, cookie, host->mrq);
 
 pio:
        if (!desc) {
@@ -196,8 +196,7 @@ pio:
                         "DMA failed: %d, falling back to PIO\n", ret);
        }
 
-       dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d\n", __func__,
-               desc, cookie);
+       dev_dbg(&host->pdev->dev, "%s(): desc %p\n", __func__, desc);
 }
 
 void tmio_mmc_start_dma(struct tmio_mmc_host *host,
index 54e056d3be0250910f464732bacc13bfd7eee50b..ee2b74d1d1b5e388a7d60880e5ef0e1f07676e70 100644 (file)
@@ -174,9 +174,9 @@ static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end,
        struct ubi_device *ubi = desc->vol->ubi;
        struct inode *inode = file_inode(file);
        int err;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        err = ubi_sync(ubi->ubi_num);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err;
 }
 
index 2c2baab9d88044d77d910653f882e9815fa57561..d66c690a859773d24db2cc835f8c365135023bf4 100644 (file)
@@ -157,6 +157,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
                [29] = "802.1ad offload support",
                [31] = "Modifying loopback source checks using UPDATE_QP support",
                [32] = "Loopback source checks support",
+               [33] = "RoCEv2 support"
        };
        int i;
 
@@ -626,6 +627,8 @@ out:
        return err;
 }
 
+static void disable_unsupported_roce_caps(void *buf);
+
 int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 {
        struct mlx4_cmd_mailbox *mailbox;
@@ -738,6 +741,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
        if (err)
                goto out;
 
+       if (mlx4_is_mfunc(dev))
+               disable_unsupported_roce_caps(outbox);
        MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
        dev_cap->reserved_qps = 1 << (field & 0xf);
        MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
@@ -905,6 +910,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
                dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
        MLX4_GET(dev_cap->bmme_flags, outbox,
                 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+       if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2)
+               dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2;
        if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP)
                dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP;
        MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
@@ -1161,6 +1168,7 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
        if (err)
                return err;
 
+       disable_unsupported_roce_caps(outbox->buf);
        /* add port mng change event capability and disable mw type 1
         * unconditionally to slaves
         */
@@ -1258,6 +1266,21 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
        return 0;
 }
 
+static void disable_unsupported_roce_caps(void *buf)
+{
+       u32 flags;
+
+       MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+       flags &= ~(1UL << 31);
+       MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+       MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+       flags &= ~(1UL << 24);
+       MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+       MLX4_GET(flags, buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+       flags &= ~(MLX4_FLAG_ROCE_V1_V2);
+       MLX4_PUT(buf, flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+}
+
 int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
                            struct mlx4_vhcr *vhcr,
                            struct mlx4_cmd_mailbox *inbox,
@@ -2239,7 +2262,8 @@ struct mlx4_config_dev {
        __be32  rsvd1[3];
        __be16  vxlan_udp_dport;
        __be16  rsvd2;
-       __be32  rsvd3;
+       __be16  roce_v2_entropy;
+       __be16  roce_v2_udp_dport;
        __be32  roce_flags;
        __be32  rsvd4[25];
        __be16  rsvd5;
@@ -2248,6 +2272,7 @@ struct mlx4_config_dev {
 };
 
 #define MLX4_VXLAN_UDP_DPORT (1 << 0)
+#define MLX4_ROCE_V2_UDP_DPORT BIT(3)
 #define MLX4_DISABLE_RX_PORT BIT(18)
 
 static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
@@ -2365,6 +2390,18 @@ int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis)
        return mlx4_CONFIG_DEV_set(dev, &config_dev);
 }
 
+int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port)
+{
+       struct mlx4_config_dev config_dev;
+
+       memset(&config_dev, 0, sizeof(config_dev));
+       config_dev.update_flags    = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT);
+       config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port);
+
+       return mlx4_CONFIG_DEV_set(dev, &config_dev);
+}
+EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port);
+
 int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2)
 {
        struct mlx4_cmd_mailbox *mailbox;
index 2404c22ad2b2ac5d890798dc5bfe7ae608232e2c..7baef52db6b7586cf41df128c8ddecc581bc1568 100644 (file)
@@ -780,7 +780,10 @@ struct mlx4_set_port_general_context {
        u16 reserved1;
        u8 v_ignore_fcs;
        u8 flags;
-       u8 ignore_fcs;
+       union {
+               u8 ignore_fcs;
+               u8 roce_mode;
+       };
        u8 reserved2;
        __be16 mtu;
        u8 pptx;
index f2550425c25147e0881fdabfb20d8514cb37a088..787b7bb54d52acd094570283bf6793f1bfb0dab0 100644 (file)
@@ -1520,6 +1520,8 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz)
        return err;
 }
 
+#define SET_PORT_ROCE_2_FLAGS          0x10
+#define MLX4_SET_PORT_ROCE_V1_V2       0x2
 int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
                          u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
 {
@@ -1539,6 +1541,11 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
        context->pprx = (pprx * (!pfcrx)) << 7;
        context->pfcrx = pfcrx;
 
+       if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+               context->flags |= SET_PORT_ROCE_2_FLAGS;
+               context->roce_mode |=
+                       MLX4_SET_PORT_ROCE_V1_V2 << 4;
+       }
        in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
        err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
                       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
index 168823dde79f3dd48596bdad5a4ea72a95ed0404..d1cd9c32a9ae83912e9b3b552c5680768b8449c3 100644 (file)
@@ -167,6 +167,12 @@ static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
                context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
        }
 
+       if ((cur_state == MLX4_QP_STATE_RTR) &&
+           (new_state == MLX4_QP_STATE_RTS) &&
+           dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+               context->roce_entropy =
+                       cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn));
+
        *(__be32 *) mailbox->buf = cpu_to_be32(optpar);
        memcpy(mailbox->buf + 8, context, sizeof *context);
 
@@ -921,3 +927,23 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
        return 0;
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_to_ready);
+
+u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn)
+{
+       struct mlx4_qp_context context;
+       struct mlx4_qp qp;
+       int err;
+
+       qp.qpn = qpn;
+       err = mlx4_qp_query(dev, &qp, &context);
+       if (!err) {
+               u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff;
+               u16 folded_dst = folded_qp(dest_qpn);
+               u16 folded_src = folded_qp(qpn);
+
+               return (dest_qpn != qpn) ?
+                       ((folded_dst ^ folded_src) | 0xC000) :
+                       folded_src | 0xC000;
+       }
+       return 0xdead;
+}
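
A note on the entropy calculation above, assuming folded_qp() (defined
elsewhere in the driver) folds a 24-bit QPN down to 16 bits: for a QP pair
with distinct numbers the function returns (folded_dst ^ folded_src) |
0xC000, keeping the value in the upper UDP source-port range conventionally
used for RoCE v2 flow entropy; a QP talking to itself just gets its own
folded number OR'd with 0xC000, and 0xdead is returned when the QP query
fails.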
index 9ea49a8933231808fff014f5a4570b7e380f5b44..aac071a7e830b5fd777da7a5e4d014d802ad70d0 100644 (file)
@@ -39,8 +39,8 @@
 #include <linux/mlx5/qp.h>
 #include <linux/mlx5/cq.h>
 #include <linux/mlx5/vport.h>
+#include <linux/mlx5/transobj.h>
 #include "wq.h"
-#include "transobj.h"
 #include "mlx5_core.h"
 
 #define MLX5E_MAX_NUM_TC       8
index c56d91a2812b04bf0857be048a7fb11fc8b2c1d0..6a3e430f10624e637fb8b3dcfaade968d2a83c53 100644 (file)
@@ -2241,7 +2241,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
                goto err_unmap_free_uar;
        }
 
-       err = mlx5_alloc_transport_domain(mdev, &priv->tdn);
+       err = mlx5_core_alloc_transport_domain(mdev, &priv->tdn);
        if (err) {
                mlx5_core_err(mdev, "alloc td failed, %d\n", err);
                goto err_dealloc_pd;
@@ -2324,7 +2324,7 @@ err_destroy_mkey:
        mlx5_core_destroy_mkey(mdev, &priv->mr);
 
 err_dealloc_transport_domain:
-       mlx5_dealloc_transport_domain(mdev, priv->tdn);
+       mlx5_core_dealloc_transport_domain(mdev, priv->tdn);
 
 err_dealloc_pd:
        mlx5_core_dealloc_pd(mdev, priv->pdn);
@@ -2356,7 +2356,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
        mlx5e_close_drop_rq(priv);
        mlx5e_destroy_tises(priv);
        mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
-       mlx5_dealloc_transport_domain(priv->mdev, priv->tdn);
+       mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn);
        mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
        mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar);
        free_netdev(netdev);
index 23c244a7e5d73d5fe0042f793d7ca955d945bf7b..647a3ca2c2a925c1d6157febb59fa51ea39872da 100644 (file)
@@ -230,6 +230,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
                case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
                case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
                        rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+                       rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
                        mlx5_core_dbg(dev, "event %s(%d) arrived on resource 0x%x\n",
                                      eqe_type_str(eqe->type), eqe->type, rsn);
                        mlx5_rsc_event(dev, rsn, eqe->type);
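
A short sketch of the RSN encoding used above, assuming
MLX5_USER_INDEX_LEN is the 24-bit user-index width implied by the 0xffffff
masks elsewhere in this series:

        /* low 24 bits: QP/RQ/SQ number; bits above: the queue type */
        u32 rsn   = qpn | (rsc_type << MLX5_USER_INDEX_LEN);
        u32 num   = rsn & 0xffffff;                /* recover the number */
        u32 rtype = rsn >> MLX5_USER_INDEX_LEN;    /* recover the type */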
index b37749a3730e576ef798d02ae380caad91d44fbf..1545a944c309bf3205ad49fc1de90bd21c3eaacd 100644 (file)
@@ -78,6 +78,11 @@ struct mlx5_device_context {
        void                   *context;
 };
 
+enum {
+       MLX5_ATOMIC_REQ_MODE_BE = 0x0,
+       MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1,
+};
+
 static struct mlx5_profile profile[] = {
        [0] = {
                .mask           = 0,
@@ -387,7 +392,7 @@ query_ex:
        return err;
 }
 
-static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz)
+static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod)
 {
        u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)];
        int err;
@@ -395,6 +400,7 @@ static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz)
        memset(out, 0, sizeof(out));
 
        MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP);
+       MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1);
        err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out));
        if (err)
                return err;
@@ -404,6 +410,46 @@ static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz)
        return err;
 }
 
+static int handle_hca_cap_atomic(struct mlx5_core_dev *dev)
+{
+       void *set_ctx;
+       void *set_hca_cap;
+       int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+       int req_endianness;
+       int err;
+
+       if (MLX5_CAP_GEN(dev, atomic)) {
+               err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC,
+                                        HCA_CAP_OPMOD_GET_CUR);
+               if (err)
+                       return err;
+       } else {
+               return 0;
+       }
+
+       req_endianness =
+               MLX5_CAP_ATOMIC(dev,
+                               supported_atomic_req_8B_endianess_mode_1);
+
+       if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS)
+               return 0;
+
+       set_ctx = kzalloc(set_sz, GFP_KERNEL);
+       if (!set_ctx)
+               return -ENOMEM;
+
+       set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
+
+       /* Set requestor to host endianness */
+       MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianess_mode,
+                MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS);
+
+       err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC);
+
+       kfree(set_ctx);
+       return err;
+}
+
 static int handle_hca_cap(struct mlx5_core_dev *dev)
 {
        void *set_ctx = NULL;
@@ -445,7 +491,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 
        MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12);
 
-       err = set_caps(dev, set_ctx, set_sz);
+       err = set_caps(dev, set_ctx, set_sz,
+                      MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
 
 query_ex:
        kfree(set_ctx);
@@ -667,7 +714,6 @@ clean:
        return err;
 }
 
-#ifdef CONFIG_MLX5_CORE_EN
 static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
 {
        u32 query_in[MLX5_ST_SZ_DW(query_issi_in)];
@@ -720,7 +766,6 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
 
        return -ENOTSUPP;
 }
-#endif
 
 static int map_bf_area(struct mlx5_core_dev *dev)
 {
@@ -966,13 +1011,11 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
                goto err_pagealloc_cleanup;
        }
 
-#ifdef CONFIG_MLX5_CORE_EN
        err = mlx5_core_set_issi(dev);
        if (err) {
                dev_err(&pdev->dev, "failed to set issi\n");
                goto err_disable_hca;
        }
-#endif
 
        err = mlx5_satisfy_startup_pages(dev, 1);
        if (err) {
@@ -992,6 +1035,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
                goto reclaim_boot_pages;
        }
 
+       err = handle_hca_cap_atomic(dev);
+       if (err) {
+               dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n");
+               goto reclaim_boot_pages;
+       }
+
        err = mlx5_satisfy_startup_pages(dev, 0);
        if (err) {
                dev_err(&pdev->dev, "failed to allocate init pages\n");
index 30e2ba3f5f16cb871fd20b0d05bb99433ea22ab1..def289375ecbf1a1543dcbbd4cf04e0a9b03ce1b 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/mlx5/cmd.h>
 #include <linux/mlx5/qp.h>
 #include <linux/mlx5/driver.h>
+#include <linux/mlx5/transobj.h>
 
 #include "mlx5_core.h"
 
@@ -67,6 +68,52 @@ void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common)
                complete(&common->free);
 }
 
+static u64 qp_allowed_event_types(void)
+{
+       u64 mask;
+
+       mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) |
+              BIT(MLX5_EVENT_TYPE_COMM_EST) |
+              BIT(MLX5_EVENT_TYPE_SQ_DRAINED) |
+              BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+              BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) |
+              BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) |
+              BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) |
+              BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR);
+
+       return mask;
+}
+
+static u64 rq_allowed_event_types(void)
+{
+       u64 mask;
+
+       mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+              BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+
+       return mask;
+}
+
+static u64 sq_allowed_event_types(void)
+{
+       return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+}
+
+static bool is_event_type_allowed(int rsc_type, int event_type)
+{
+       switch (rsc_type) {
+       case MLX5_EVENT_QUEUE_TYPE_QP:
+               return BIT(event_type) & qp_allowed_event_types();
+       case MLX5_EVENT_QUEUE_TYPE_RQ:
+               return BIT(event_type) & rq_allowed_event_types();
+       case MLX5_EVENT_QUEUE_TYPE_SQ:
+               return BIT(event_type) & sq_allowed_event_types();
+       default:
+               WARN(1, "Event arrived for unknown resource type");
+               return false;
+       }
+}
+
 void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
 {
        struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn);
@@ -75,8 +122,16 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
        if (!common)
                return;
 
+       if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) {
+               mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n",
+                              event_type, rsn);
+               return;
+       }
+
        switch (common->res) {
        case MLX5_RES_QP:
+       case MLX5_RES_RQ:
+       case MLX5_RES_SQ:
                qp = (struct mlx5_core_qp *)common;
                qp->event(qp, event_type);
                break;
@@ -177,27 +232,56 @@ void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
 }
 #endif
 
+static int create_qprqsq_common(struct mlx5_core_dev *dev,
+                               struct mlx5_core_qp *qp,
+                               int rsc_type)
+{
+       struct mlx5_qp_table *table = &dev->priv.qp_table;
+       int err;
+
+       qp->common.res = rsc_type;
+       spin_lock_irq(&table->lock);
+       err = radix_tree_insert(&table->tree,
+                               qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN),
+                               qp);
+       spin_unlock_irq(&table->lock);
+       if (err)
+               return err;
+
+       atomic_set(&qp->common.refcount, 1);
+       init_completion(&qp->common.free);
+       qp->pid = current->pid;
+
+       return 0;
+}
+
+static void destroy_qprqsq_common(struct mlx5_core_dev *dev,
+                                 struct mlx5_core_qp *qp)
+{
+       struct mlx5_qp_table *table = &dev->priv.qp_table;
+       unsigned long flags;
+
+       spin_lock_irqsave(&table->lock, flags);
+       radix_tree_delete(&table->tree,
+                         qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
+       spin_unlock_irqrestore(&table->lock, flags);
+       mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
+       wait_for_completion(&qp->common.free);
+}
+
 int mlx5_core_create_qp(struct mlx5_core_dev *dev,
                        struct mlx5_core_qp *qp,
                        struct mlx5_create_qp_mbox_in *in,
                        int inlen)
 {
-       struct mlx5_qp_table *table = &dev->priv.qp_table;
        struct mlx5_create_qp_mbox_out out;
        struct mlx5_destroy_qp_mbox_in din;
        struct mlx5_destroy_qp_mbox_out dout;
        int err;
-       void *qpc;
 
        memset(&out, 0, sizeof(out));
        in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP);
 
-       if (dev->issi) {
-               qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
-               /* 0xffffff means we ask to work with cqe version 0 */
-               MLX5_SET(qpc, qpc, user_index, 0xffffff);
-       }
-
        err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
        if (err) {
                mlx5_core_warn(dev, "ret %d\n", err);
@@ -213,24 +297,16 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
        qp->qpn = be32_to_cpu(out.qpn) & 0xffffff;
        mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn);
 
-       qp->common.res = MLX5_RES_QP;
-       spin_lock_irq(&table->lock);
-       err = radix_tree_insert(&table->tree, qp->qpn, qp);
-       spin_unlock_irq(&table->lock);
-       if (err) {
-               mlx5_core_warn(dev, "err %d\n", err);
+       err = create_qprqsq_common(dev, qp, MLX5_RES_QP);
+       if (err)
                goto err_cmd;
-       }
 
        err = mlx5_debug_qp_add(dev, qp);
        if (err)
                mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n",
                              qp->qpn);
 
-       qp->pid = current->pid;
-       atomic_set(&qp->common.refcount, 1);
        atomic_inc(&dev->num_qps);
-       init_completion(&qp->common.free);
 
        return 0;
 
@@ -250,18 +326,11 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
 {
        struct mlx5_destroy_qp_mbox_in in;
        struct mlx5_destroy_qp_mbox_out out;
-       struct mlx5_qp_table *table = &dev->priv.qp_table;
-       unsigned long flags;
        int err;
 
        mlx5_debug_qp_remove(dev, qp);
 
-       spin_lock_irqsave(&table->lock, flags);
-       radix_tree_delete(&table->tree, qp->qpn);
-       spin_unlock_irqrestore(&table->lock, flags);
-
-       mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
-       wait_for_completion(&qp->common.free);
+       destroy_qprqsq_common(dev, qp);
 
        memset(&in, 0, sizeof(in));
        memset(&out, 0, sizeof(out));
@@ -279,59 +348,15 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp);
 
-int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
-                       enum mlx5_qp_state new_state,
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation,
                        struct mlx5_modify_qp_mbox_in *in, int sqd_event,
                        struct mlx5_core_qp *qp)
 {
-       static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
-               [MLX5_QP_STATE_RST] = {
-                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
-                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
-                       [MLX5_QP_STATE_INIT]    = MLX5_CMD_OP_RST2INIT_QP,
-               },
-               [MLX5_QP_STATE_INIT]  = {
-                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
-                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
-                       [MLX5_QP_STATE_INIT]    = MLX5_CMD_OP_INIT2INIT_QP,
-                       [MLX5_QP_STATE_RTR]     = MLX5_CMD_OP_INIT2RTR_QP,
-               },
-               [MLX5_QP_STATE_RTR]   = {
-                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
-                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
-                       [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_RTR2RTS_QP,
-               },
-               [MLX5_QP_STATE_RTS]   = {
-                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
-                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
-                       [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_RTS2RTS_QP,
-               },
-               [MLX5_QP_STATE_SQD] = {
-                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
-                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
-               },
-               [MLX5_QP_STATE_SQER] = {
-                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
-                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
-                       [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_SQERR2RTS_QP,
-               },
-               [MLX5_QP_STATE_ERR] = {
-                       [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
-                       [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
-               }
-       };
-
        struct mlx5_modify_qp_mbox_out out;
        int err = 0;
-       u16 op;
-
-       if (cur_state >= MLX5_QP_NUM_STATE || new_state >= MLX5_QP_NUM_STATE ||
-           !optab[cur_state][new_state])
-               return -EINVAL;
 
        memset(&out, 0, sizeof(out));
-       op = optab[cur_state][new_state];
-       in->hdr.opcode = cpu_to_be16(op);
+       in->hdr.opcode = cpu_to_be16(operation);
        in->qpn = cpu_to_be32(qp->qpn);
        err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out));
        if (err)
@@ -449,3 +474,67 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
 }
 EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
 #endif
+
+int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                               struct mlx5_core_qp *rq)
+{
+       int err;
+       u32 rqn;
+
+       err = mlx5_core_create_rq(dev, in, inlen, &rqn);
+       if (err)
+               return err;
+
+       rq->qpn = rqn;
+       err = create_qprqsq_common(dev, rq, MLX5_RES_RQ);
+       if (err)
+               goto err_destroy_rq;
+
+       return 0;
+
+err_destroy_rq:
+       mlx5_core_destroy_rq(dev, rq->qpn);
+
+       return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_rq_tracked);
+
+void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
+                                 struct mlx5_core_qp *rq)
+{
+       destroy_qprqsq_common(dev, rq);
+       mlx5_core_destroy_rq(dev, rq->qpn);
+}
+EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked);
+
+int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                               struct mlx5_core_qp *sq)
+{
+       int err;
+       u32 sqn;
+
+       err = mlx5_core_create_sq(dev, in, inlen, &sqn);
+       if (err)
+               return err;
+
+       sq->qpn = sqn;
+       err = create_qprqsq_common(dev, sq, MLX5_RES_SQ);
+       if (err)
+               goto err_destroy_sq;
+
+       return 0;
+
+err_destroy_sq:
+       mlx5_core_destroy_sq(dev, sq->qpn);
+
+       return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_sq_tracked);
+
+void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
+                                 struct mlx5_core_qp *sq)
+{
+       destroy_qprqsq_common(dev, sq);
+       mlx5_core_destroy_sq(dev, sq->qpn);
+}
+EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
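For reference, a minimal consumer-side sketch of the tracked-queue helpers added above (hypothetical caller; my_rq_event, my_open_rq, and my_close_rq are illustrative names, not part of this patch). The tracked variants insert the RQ into the QP radix tree tagged MLX5_RES_RQ, so firmware events that pass the allowed-event filter are dispatched to the queue's event callback just as for ordinary QPs:

#include <linux/mlx5/driver.h>
#include <linux/mlx5/qp.h>

static void my_rq_event(struct mlx5_core_qp *rq, int event_type)
{
	/* Reached from mlx5_rsc_event(); only events that pass
	 * is_event_type_allowed() for the RQ resource type land here.
	 */
	pr_warn("rq 0x%x: event 0x%x\n", rq->qpn, event_type);
}

static int my_open_rq(struct mlx5_core_dev *mdev, u32 *in, int inlen,
		      struct mlx5_core_qp *rq)
{
	rq->event = my_rq_event;
	return mlx5_core_create_rq_tracked(mdev, in, inlen, rq);
}

static void my_close_rq(struct mlx5_core_dev *mdev, struct mlx5_core_qp *rq)
{
	/* Removes the RQ from the tree, waits out outstanding
	 * references, then issues DESTROY_RQ.
	 */
	mlx5_core_destroy_rq_tracked(mdev, rq);
}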
index ffada801976bff52ef05320d5605f08798af3129..04bc522605a03bb9cb7e6602f9c003ccfc28433b 100644 (file)
@@ -37,7 +37,7 @@
 #include <linux/mlx5/srq.h>
 #include <rdma/ib_verbs.h>
 #include "mlx5_core.h"
-#include "transobj.h"
+#include <linux/mlx5/transobj.h>
 
 void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
 {
@@ -241,8 +241,6 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
 
        memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc));
        memcpy(pas, in->pas, pas_size);
-       /* 0xffffff means we ask to work with cqe version 0 */
-       MLX5_SET(xrc_srqc,          xrc_srqc,  user_index, 0xffffff);
        MLX5_SET(create_xrc_srq_in, create_in, opcode,
                 MLX5_CMD_OP_CREATE_XRC_SRQ);
 
index d7068f54e80066acdcaf6bf081082d50feeebc57..03a5093ffeb72ab6f5c490d1eff42967ceb7f0f4 100644 (file)
@@ -32,9 +32,9 @@
 
 #include <linux/mlx5/driver.h>
 #include "mlx5_core.h"
-#include "transobj.h"
+#include <linux/mlx5/transobj.h>
 
-int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn)
+int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn)
 {
        u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)];
        u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)];
@@ -53,8 +53,9 @@ int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn)
 
        return err;
 }
+EXPORT_SYMBOL(mlx5_core_alloc_transport_domain);
 
-void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn)
+void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn)
 {
        u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)];
        u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)];
@@ -68,6 +69,7 @@ void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn)
 
        mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain);
 
 int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn)
 {
@@ -94,6 +96,7 @@ int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen)
        memset(out, 0, sizeof(out));
        return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_core_modify_rq);
 
 void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
 {
@@ -108,6 +111,18 @@ void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
        mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
 
+int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out)
+{
+       u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {0};
+       int outlen = MLX5_ST_SZ_BYTES(query_rq_out);
+
+       MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ);
+       MLX5_SET(query_rq_in, in, rqn, rqn);
+
+       return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_rq);
+
 int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn)
 {
        u32 out[MLX5_ST_SZ_DW(create_sq_out)];
@@ -133,6 +148,7 @@ int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen)
        memset(out, 0, sizeof(out));
        return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_core_modify_sq);
 
 void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn)
 {
@@ -147,6 +163,18 @@ void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn)
        mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
 
+int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out)
+{
+       u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0};
+       int outlen = MLX5_ST_SZ_BYTES(query_sq_out);
+
+       MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ);
+       MLX5_SET(query_sq_in, in, sqn, sqn);
+
+       return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_sq);
+
 int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
                         u32 *tirn)
 {
@@ -162,6 +190,7 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
 
        return err;
 }
+EXPORT_SYMBOL(mlx5_core_create_tir);
 
 int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
                         int inlen)
@@ -187,6 +216,7 @@ void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn)
 
        mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_core_destroy_tir);
 
 int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
                         u32 *tisn)
@@ -203,6 +233,19 @@ int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
 
        return err;
 }
+EXPORT_SYMBOL(mlx5_core_create_tis);
+
+int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in,
+                        int inlen)
+{
+       u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0};
+
+       MLX5_SET(modify_tis_in, in, tisn, tisn);
+       MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS);
+
+       return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_modify_tis);
 
 void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn)
 {
@@ -216,6 +259,7 @@ void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn)
 
        mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
+EXPORT_SYMBOL(mlx5_core_destroy_tis);
 
 int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
                         u32 *rmpn)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
deleted file mode 100644 (file)
index 74cae51..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __TRANSOBJ_H__
-#define __TRANSOBJ_H__
-
-int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn);
-void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn);
-int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen,
-                       u32 *rqn);
-int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen);
-void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn);
-int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen,
-                       u32 *sqn);
-int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen);
-void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn);
-int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
-                        u32 *tirn);
-int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
-                        int inlen);
-void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
-int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
-                        u32 *tisn);
-void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn);
-int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
-                        u32 *rmpn);
-int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen);
-int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn);
-int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out);
-int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
-int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen,
-                         u32 *rmpn);
-int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 rmpn);
-int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u32 *out);
-int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
-
-int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
-                        u32 *rqtn);
-int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
-                        int inlen);
-void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn);
-
-#endif /* __TRANSOBJ_H__ */
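With the private header deleted, the transport-object API now lives in include/linux/mlx5/transobj.h and several entry points are exported for use outside mlx5_core. A minimal sketch, assuming the exported symbols above (my_create_tis is a hypothetical helper; a real caller would keep tdn around for teardown):

#include <linux/mlx5/driver.h>
#include <linux/mlx5/mlx5_ifc.h>
#include <linux/mlx5/transobj.h>

static int my_create_tis(struct mlx5_core_dev *mdev, u32 *tdn, u32 *tisn)
{
	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
	int err;

	/* A TIS must belong to a transport domain. */
	err = mlx5_core_alloc_transport_domain(mdev, tdn);
	if (err)
		return err;

	MLX5_SET(tisc, tisc, transport_domain, *tdn);
	err = mlx5_core_create_tis(mdev, in, sizeof(in), tisn);
	if (err)
		mlx5_core_dealloc_transport_domain(mdev, *tdn);

	return err;
}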
index 076197efea9b07916480a97a8258e84703a3db66..c7398b95aecdc0286480f85564e5a42660275bd6 100644 (file)
@@ -76,7 +76,7 @@ u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
 
        return MLX5_GET(query_vport_state_out, out, admin_state);
 }
-EXPORT_SYMBOL(mlx5_query_vport_admin_state);
+EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state);
 
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
                                  u16 vport, u8 state)
@@ -104,7 +104,7 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 
        return err;
 }
-EXPORT_SYMBOL(mlx5_modify_vport_admin_state);
+EXPORT_SYMBOL_GPL(mlx5_modify_vport_admin_state);
 
 static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
                                        u32 *out, int outlen)
@@ -151,12 +151,9 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
                                nic_vport_context.permanent_address);
 
        err = mlx5_query_nic_vport_context(mdev, vport, out, outlen);
-       if (err)
-               goto out;
-
-       ether_addr_copy(addr, &out_addr[2]);
+       if (!err)
+               ether_addr_copy(addr, &out_addr[2]);
 
-out:
        kvfree(out);
        return err;
 }
@@ -197,7 +194,7 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 
        return err;
 }
-EXPORT_SYMBOL(mlx5_modify_nic_vport_mac_address);
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address);
 
 int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev,
                                  u32 vport,
@@ -430,6 +427,68 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans);
 
+int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
+                                          u64 *system_image_guid)
+{
+       u32 *out;
+       int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+       int err;
+
+       out = mlx5_vzalloc(outlen);
+       if (!out)
+               return -ENOMEM;
+
+       err = mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+       if (!err)
+               *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out,
+                                               nic_vport_context.system_image_guid);
+
+       kvfree(out);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid);
+
+int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
+{
+       u32 *out;
+       int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+       int err;
+
+       out = mlx5_vzalloc(outlen);
+       if (!out)
+               return -ENOMEM;
+
+       err = mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+       if (!err)
+               *node_guid = MLX5_GET64(query_nic_vport_context_out, out,
+                                       nic_vport_context.node_guid);
+
+       kvfree(out);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid);
+
+int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
+                                       u16 *qkey_viol_cntr)
+{
+       u32 *out;
+       int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+       int err;
+
+       out = mlx5_vzalloc(outlen);
+       if (!out)
+               return -ENOMEM;
+
+       err = mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+       if (!err)
+               *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out,
+                                          nic_vport_context.qkey_violation_counter);
+
+       kvfree(out);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr);
+
 int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport,
                             u8 port_num, u16  vf_num, u16 gid_index,
                             union ib_gid *gid)
@@ -750,3 +809,44 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev,
        return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc);
+
+enum mlx5_vport_roce_state {
+       MLX5_VPORT_ROCE_DISABLED = 0,
+       MLX5_VPORT_ROCE_ENABLED  = 1,
+};
+
+static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev,
+                                           enum mlx5_vport_roce_state state)
+{
+       void *in;
+       int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+       int err;
+
+       in = mlx5_vzalloc(inlen);
+       if (!in) {
+               mlx5_core_warn(mdev, "failed to allocate inbox\n");
+               return -ENOMEM;
+       }
+
+       MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1);
+       MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en,
+                state);
+
+       err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+       kvfree(in);
+
+       return err;
+}
+
+int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev)
+{
+       return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED);
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce);
+
+int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev)
+{
+       return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED);
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce);
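A hedged sketch of how an RDMA consumer might use the new RoCE-state and GUID helpers exported above (my_roce_init is illustrative and not part of this patch; mlx5_ib is the intended user):

#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>

static int my_roce_init(struct mlx5_core_dev *mdev)
{
	u64 sys_image_guid;
	int err;

	err = mlx5_nic_vport_enable_roce(mdev);
	if (err)
		return err;

	err = mlx5_query_nic_vport_system_image_guid(mdev, &sys_image_guid);
	if (err)
		goto err_disable;

	pr_info("roce enabled, sys_image_guid 0x%llx\n", sys_image_guid);
	return 0;

err_disable:
	mlx5_nic_vport_disable_roce(mdev);
	return err;
}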
index 4d5535c4cddf3877109c9a0970f88606ec7f42de..7116472b462577935eb0bad50b986ba510a038b5 100644 (file)
@@ -1 +1,2 @@
+source "drivers/ntb/hw/amd/Kconfig"
 source "drivers/ntb/hw/intel/Kconfig"
index 175d7c92a569a5af90dc026acbd177444d8b9d90..532e0859b4a11bb6c90d357ad8bc00cbea86e006 100644 (file)
@@ -1 +1,2 @@
+obj-$(CONFIG_NTB_AMD)  += amd/
 obj-$(CONFIG_NTB_INTEL)        += intel/
diff --git a/drivers/ntb/hw/amd/Kconfig b/drivers/ntb/hw/amd/Kconfig
new file mode 100644 (file)
index 0000000..cfe903c
--- /dev/null
@@ -0,0 +1,7 @@
+config NTB_AMD
+       tristate "AMD Non-Transparent Bridge support"
+       depends on X86_64
+       help
+        This driver supports AMD NTB on capable Zeppelin hardware.
+
+        If unsure, say N.
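A minimal config-fragment sketch for building the new driver as a module (assumes an x86_64 .config; CONFIG_NTB is the existing NTB core option):

CONFIG_NTB=m
CONFIG_NTB_AMD=m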
diff --git a/drivers/ntb/hw/amd/Makefile b/drivers/ntb/hw/amd/Makefile
new file mode 100644 (file)
index 0000000..ad54da9
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_NTB_AMD) += ntb_hw_amd.o
diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c
new file mode 100644 (file)
index 0000000..588803a
--- /dev/null
@@ -0,0 +1,1143 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of AMD Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * AMD PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Xiangliang Yu <Xiangliang.Yu@amd.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/acpi.h>
+#include <linux/pci.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/ntb.h>
+
+#include "ntb_hw_amd.h"
+
+#define NTB_NAME       "ntb_hw_amd"
+#define NTB_DESC       "AMD(R) PCI-E Non-Transparent Bridge Driver"
+#define NTB_VER                "1.0"
+
+MODULE_DESCRIPTION(NTB_DESC);
+MODULE_VERSION(NTB_VER);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("AMD Inc.");
+
+static const struct file_operations amd_ntb_debugfs_info;
+static struct dentry *debugfs_dir;
+
+static int ndev_mw_to_bar(struct amd_ntb_dev *ndev, int idx)
+{
+       if (idx < 0 || idx >= ndev->mw_count)
+               return -EINVAL;
+
+       return 1 << idx;
+}
+
+static int amd_ntb_mw_count(struct ntb_dev *ntb)
+{
+       return ntb_ndev(ntb)->mw_count;
+}
+
+static int amd_ntb_mw_get_range(struct ntb_dev *ntb, int idx,
+                               phys_addr_t *base,
+                               resource_size_t *size,
+                               resource_size_t *align,
+                               resource_size_t *align_size)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       int bar;
+
+       bar = ndev_mw_to_bar(ndev, idx);
+       if (bar < 0)
+               return bar;
+
+       if (base)
+               *base = pci_resource_start(ndev->ntb.pdev, bar);
+
+       if (size)
+               *size = pci_resource_len(ndev->ntb.pdev, bar);
+
+       if (align)
+               *align = SZ_4K;
+
+       if (align_size)
+               *align_size = 1;
+
+       return 0;
+}
+
+static int amd_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
+                               dma_addr_t addr, resource_size_t size)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       unsigned long xlat_reg, limit_reg = 0;
+       resource_size_t mw_size;
+       void __iomem *mmio, *peer_mmio;
+       u64 base_addr, limit, reg_val;
+       int bar;
+
+       bar = ndev_mw_to_bar(ndev, idx);
+       if (bar < 0)
+               return bar;
+
+       mw_size = pci_resource_len(ndev->ntb.pdev, bar);
+
+       /* make sure the range fits in the usable mw size */
+       if (size > mw_size)
+               return -EINVAL;
+
+       mmio = ndev->self_mmio;
+       peer_mmio = ndev->peer_mmio;
+
+       base_addr = pci_resource_start(ndev->ntb.pdev, bar);
+
+       if (bar != 1) {
+               xlat_reg = AMD_BAR23XLAT_OFFSET + ((bar - 2) << 3);
+               limit_reg = AMD_BAR23LMT_OFFSET + ((bar - 2) << 3);
+
+               /* Set the limit if supported */
+               limit = base_addr + size;
+
+               /* set and verify setting the translation address */
+               write64(addr, peer_mmio + xlat_reg);
+               reg_val = read64(peer_mmio + xlat_reg);
+               if (reg_val != addr) {
+                       write64(0, peer_mmio + xlat_reg);
+                       return -EIO;
+               }
+
+               /* set and verify setting the limit */
+               write64(limit, mmio + limit_reg);
+               reg_val = read64(mmio + limit_reg);
+               if (reg_val != limit) {
+                       write64(base_addr, mmio + limit_reg);
+                       write64(0, peer_mmio + xlat_reg);
+                       return -EIO;
+               }
+       } else {
+               xlat_reg = AMD_BAR1XLAT_OFFSET;
+               limit_reg = AMD_BAR1LMT_OFFSET;
+
+               /* the split-BAR address range must fit entirely in 32 bits */
+               if (addr & (~0ull << 32))
+                       return -EINVAL;
+               if ((addr + size) & (~0ull << 32))
+                       return -EINVAL;
+
+               /* Set the limit if supported */
+               limit = base_addr + size;
+
+               /* set and verify setting the translation address */
+               write64(addr, peer_mmio + xlat_reg);
+               reg_val = read64(peer_mmio + xlat_reg);
+               if (reg_val != addr) {
+                       write64(0, peer_mmio + xlat_reg);
+                       return -EIO;
+               }
+
+               /* set and verify setting the limit */
+               writel(limit, mmio + limit_reg);
+               reg_val = readl(mmio + limit_reg);
+               if (reg_val != limit) {
+                       writel(base_addr, mmio + limit_reg);
+                       writel(0, peer_mmio + xlat_reg);
+                       return -EIO;
+               }
+       }
+
+       return 0;
+}
+
+static int amd_link_is_up(struct amd_ntb_dev *ndev)
+{
+       if (!ndev->peer_sta)
+               return NTB_LNK_STA_ACTIVE(ndev->cntl_sta);
+
+       /* If peer_sta indicates a reset or D0 event, the ISR has
+        * already started a timer to poll the hardware link status,
+        * so just clear the status bit here. If peer_sta indicates
+        * D3 or PME_TO, the D0/reset event will arrive when the
+        * system wakes up or powers on, so do nothing here.
+        */
+       if (ndev->peer_sta & AMD_PEER_RESET_EVENT)
+               ndev->peer_sta &= ~AMD_PEER_RESET_EVENT;
+       else if (ndev->peer_sta & AMD_PEER_D0_EVENT)
+               ndev->peer_sta = 0;
+
+       return 0;
+}
+
+static int amd_ntb_link_is_up(struct ntb_dev *ntb,
+                             enum ntb_speed *speed,
+                             enum ntb_width *width)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       int ret = 0;
+
+       if (amd_link_is_up(ndev)) {
+               if (speed)
+                       *speed = NTB_LNK_STA_SPEED(ndev->lnk_sta);
+               if (width)
+                       *width = NTB_LNK_STA_WIDTH(ndev->lnk_sta);
+
+               dev_dbg(ndev_dev(ndev), "link is up.\n");
+
+               ret = 1;
+       } else {
+               if (speed)
+                       *speed = NTB_SPEED_NONE;
+               if (width)
+                       *width = NTB_WIDTH_NONE;
+
+               dev_dbg(ndev_dev(ndev), "link is down.\n");
+       }
+
+       return ret;
+}
+
+static int amd_ntb_link_enable(struct ntb_dev *ntb,
+                              enum ntb_speed max_speed,
+                              enum ntb_width max_width)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       u32 ntb_ctl;
+
+       /* Enable event interrupt */
+       ndev->int_mask &= ~AMD_EVENT_INTMASK;
+       writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET);
+
+       if (ndev->ntb.topo == NTB_TOPO_SEC)
+               return -EINVAL;
+       dev_dbg(ndev_dev(ndev), "Enabling Link.\n");
+
+       ntb_ctl = readl(mmio + AMD_CNTL_OFFSET);
+       ntb_ctl |= (PMM_REG_CTL | SMM_REG_CTL);
+       writel(ntb_ctl, mmio + AMD_CNTL_OFFSET);
+
+       return 0;
+}
+
+static int amd_ntb_link_disable(struct ntb_dev *ntb)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       u32 ntb_ctl;
+
+       /* Disable event interrupt */
+       ndev->int_mask |= AMD_EVENT_INTMASK;
+       writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET);
+
+       if (ndev->ntb.topo == NTB_TOPO_SEC)
+               return -EINVAL;
+       dev_dbg(ndev_dev(ndev), "Disabling Link.\n");
+
+       ntb_ctl = readl(mmio + AMD_CNTL_OFFSET);
+       ntb_ctl &= ~(PMM_REG_CTL | SMM_REG_CTL);
+       writel(ntb_ctl, mmio + AMD_CNTL_OFFSET);
+
+       return 0;
+}
+
+static u64 amd_ntb_db_valid_mask(struct ntb_dev *ntb)
+{
+       return ntb_ndev(ntb)->db_valid_mask;
+}
+
+static int amd_ntb_db_vector_count(struct ntb_dev *ntb)
+{
+       return ntb_ndev(ntb)->db_count;
+}
+
+static u64 amd_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+
+       if (db_vector < 0 || db_vector >= ndev->db_count)
+               return 0;
+
+       return ntb_ndev(ntb)->db_valid_mask & (1 << db_vector);
+}
+
+static u64 amd_ntb_db_read(struct ntb_dev *ntb)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+
+       return (u64)readw(mmio + AMD_DBSTAT_OFFSET);
+}
+
+static int amd_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+
+       writew((u16)db_bits, mmio + AMD_DBSTAT_OFFSET);
+
+       return 0;
+}
+
+static int amd_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       unsigned long flags;
+
+       if (db_bits & ~ndev->db_valid_mask)
+               return -EINVAL;
+
+       spin_lock_irqsave(&ndev->db_mask_lock, flags);
+       ndev->db_mask |= db_bits;
+       writew((u16)ndev->db_mask, mmio + AMD_DBMASK_OFFSET);
+       spin_unlock_irqrestore(&ndev->db_mask_lock, flags);
+
+       return 0;
+}
+
+static int amd_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       unsigned long flags;
+
+       if (db_bits & ~ndev->db_valid_mask)
+               return -EINVAL;
+
+       spin_lock_irqsave(&ndev->db_mask_lock, flags);
+       ndev->db_mask &= ~db_bits;
+       writew((u16)ndev->db_mask, mmio + AMD_DBMASK_OFFSET);
+       spin_unlock_irqrestore(&ndev->db_mask_lock, flags);
+
+       return 0;
+}
+
+static int amd_ntb_peer_db_addr(struct ntb_dev *ntb,
+                               phys_addr_t *db_addr,
+                               resource_size_t *db_size)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+
+       if (db_addr)
+               *db_addr = (phys_addr_t)(ndev->peer_mmio + AMD_DBREQ_OFFSET);
+       if (db_size)
+               *db_size = sizeof(u32);
+
+       return 0;
+}
+
+static int amd_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+
+       writew((u16)db_bits, mmio + AMD_DBREQ_OFFSET);
+
+       return 0;
+}
+
+static int amd_ntb_spad_count(struct ntb_dev *ntb)
+{
+       return ntb_ndev(ntb)->spad_count;
+}
+
+static u32 amd_ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       u32 offset;
+
+       if (idx < 0 || idx >= ndev->spad_count)
+               return 0;
+
+       offset = ndev->self_spad + (idx << 2);
+       return readl(mmio + AMD_SPAD_OFFSET + offset);
+}
+
+static int amd_ntb_spad_write(struct ntb_dev *ntb,
+                             int idx, u32 val)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       u32 offset;
+
+       if (idx < 0 || idx >= ndev->spad_count)
+               return -EINVAL;
+
+       offset = ndev->self_spad + (idx << 2);
+       writel(val, mmio + AMD_SPAD_OFFSET + offset);
+
+       return 0;
+}
+
+static int amd_ntb_peer_spad_addr(struct ntb_dev *ntb, int idx,
+                                 phys_addr_t *spad_addr)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+
+       if (idx < 0 || idx >= ndev->spad_count)
+               return -EINVAL;
+
+       if (spad_addr)
+               *spad_addr = (phys_addr_t)(ndev->self_mmio + AMD_SPAD_OFFSET +
+                                          ndev->peer_spad + (idx << 2));
+       return 0;
+}
+
+static u32 amd_ntb_peer_spad_read(struct ntb_dev *ntb, int idx)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       u32 offset;
+
+       if (idx < 0 || idx >= ndev->spad_count)
+               return -EINVAL;
+
+       offset = ndev->peer_spad + (idx << 2);
+       return readl(mmio + AMD_SPAD_OFFSET + offset);
+}
+
+static int amd_ntb_peer_spad_write(struct ntb_dev *ntb,
+                                  int idx, u32 val)
+{
+       struct amd_ntb_dev *ndev = ntb_ndev(ntb);
+       void __iomem *mmio = ndev->self_mmio;
+       u32 offset;
+
+       if (idx < 0 || idx >= ndev->spad_count)
+               return -EINVAL;
+
+       offset = ndev->peer_spad + (idx << 2);
+       writel(val, mmio + AMD_SPAD_OFFSET + offset);
+
+       return 0;
+}
+
+static const struct ntb_dev_ops amd_ntb_ops = {
+       .mw_count               = amd_ntb_mw_count,
+       .mw_get_range           = amd_ntb_mw_get_range,
+       .mw_set_trans           = amd_ntb_mw_set_trans,
+       .link_is_up             = amd_ntb_link_is_up,
+       .link_enable            = amd_ntb_link_enable,
+       .link_disable           = amd_ntb_link_disable,
+       .db_valid_mask          = amd_ntb_db_valid_mask,
+       .db_vector_count        = amd_ntb_db_vector_count,
+       .db_vector_mask         = amd_ntb_db_vector_mask,
+       .db_read                = amd_ntb_db_read,
+       .db_clear               = amd_ntb_db_clear,
+       .db_set_mask            = amd_ntb_db_set_mask,
+       .db_clear_mask          = amd_ntb_db_clear_mask,
+       .peer_db_addr           = amd_ntb_peer_db_addr,
+       .peer_db_set            = amd_ntb_peer_db_set,
+       .spad_count             = amd_ntb_spad_count,
+       .spad_read              = amd_ntb_spad_read,
+       .spad_write             = amd_ntb_spad_write,
+       .peer_spad_addr         = amd_ntb_peer_spad_addr,
+       .peer_spad_read         = amd_ntb_peer_spad_read,
+       .peer_spad_write        = amd_ntb_peer_spad_write,
+};
+
+static void amd_ack_smu(struct amd_ntb_dev *ndev, u32 bit)
+{
+       void __iomem *mmio = ndev->self_mmio;
+       int reg;
+
+       reg = readl(mmio + AMD_SMUACK_OFFSET);
+       reg |= bit;
+       writel(reg, mmio + AMD_SMUACK_OFFSET);
+
+       ndev->peer_sta |= bit;
+}
+
+static void amd_handle_event(struct amd_ntb_dev *ndev, int vec)
+{
+       void __iomem *mmio = ndev->self_mmio;
+       u32 status;
+
+       status = readl(mmio + AMD_INTSTAT_OFFSET);
+       if (!(status & AMD_EVENT_INTMASK))
+               return;
+
+       dev_dbg(ndev_dev(ndev), "status = 0x%x and vec = %d\n", status, vec);
+
+       status &= AMD_EVENT_INTMASK;
+       switch (status) {
+       case AMD_PEER_FLUSH_EVENT:
+               dev_info(ndev_dev(ndev), "Flush is done.\n");
+               break;
+       case AMD_PEER_RESET_EVENT:
+               amd_ack_smu(ndev, AMD_PEER_RESET_EVENT);
+
+               /* link down first */
+               ntb_link_event(&ndev->ntb);
+               /* polling peer status */
+               schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT);
+
+               break;
+       case AMD_PEER_D3_EVENT:
+       case AMD_PEER_PMETO_EVENT:
+               amd_ack_smu(ndev, status);
+
+               /* link down */
+               ntb_link_event(&ndev->ntb);
+
+               break;
+       case AMD_PEER_D0_EVENT:
+               mmio = ndev->peer_mmio;
+               status = readl(mmio + AMD_PMESTAT_OFFSET);
+               /* check if this is WAKEUP event */
+               if (status & 0x1)
+                       dev_info(ndev_dev(ndev), "Wakeup is done.\n");
+
+               amd_ack_smu(ndev, AMD_PEER_D0_EVENT);
+
+               /* start a timer to poll link status */
+               schedule_delayed_work(&ndev->hb_timer,
+                                     AMD_LINK_HB_TIMEOUT);
+               break;
+       default:
+               dev_info(ndev_dev(ndev), "event status = 0x%x.\n", status);
+               break;
+       }
+}
+
+static irqreturn_t ndev_interrupt(struct amd_ntb_dev *ndev, int vec)
+{
+       dev_dbg(ndev_dev(ndev), "vec %d\n", vec);
+
+       if (vec > (AMD_DB_CNT - 1) || (ndev->msix_vec_count == 1))
+               amd_handle_event(ndev, vec);
+
+       if (vec < AMD_DB_CNT)
+               ntb_db_event(&ndev->ntb, vec);
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t ndev_vec_isr(int irq, void *dev)
+{
+       struct amd_ntb_vec *nvec = dev;
+
+       return ndev_interrupt(nvec->ndev, nvec->num);
+}
+
+static irqreturn_t ndev_irq_isr(int irq, void *dev)
+{
+       struct amd_ntb_dev *ndev = dev;
+
+       return ndev_interrupt(ndev, irq - ndev_pdev(ndev)->irq);
+}
+
+static int ndev_init_isr(struct amd_ntb_dev *ndev,
+                        int msix_min, int msix_max)
+{
+       struct pci_dev *pdev;
+       int rc, i, msix_count, node;
+
+       pdev = ndev_pdev(ndev);
+
+       node = dev_to_node(&pdev->dev);
+
+       ndev->db_mask = ndev->db_valid_mask;
+
+       /* Try to set up msix irq */
+       ndev->vec = kzalloc_node(msix_max * sizeof(*ndev->vec),
+                                GFP_KERNEL, node);
+       if (!ndev->vec)
+               goto err_msix_vec_alloc;
+
+       ndev->msix = kzalloc_node(msix_max * sizeof(*ndev->msix),
+                                 GFP_KERNEL, node);
+       if (!ndev->msix)
+               goto err_msix_alloc;
+
+       for (i = 0; i < msix_max; ++i)
+               ndev->msix[i].entry = i;
+
+       msix_count = pci_enable_msix_range(pdev, ndev->msix,
+                                          msix_min, msix_max);
+       if (msix_count < 0)
+               goto err_msix_enable;
+
+       /* NOTE: Disable MSI-X if the granted vector count is less than
+        * the required minimum (msix_min), a hardware limitation.
+        */
+       if (msix_count < msix_min) {
+               pci_disable_msix(pdev);
+               goto err_msix_enable;
+       }
+
+       for (i = 0; i < msix_count; ++i) {
+               ndev->vec[i].ndev = ndev;
+               ndev->vec[i].num = i;
+               rc = request_irq(ndev->msix[i].vector, ndev_vec_isr, 0,
+                                "ndev_vec_isr", &ndev->vec[i]);
+               if (rc)
+                       goto err_msix_request;
+       }
+
+       dev_dbg(ndev_dev(ndev), "Using msix interrupts\n");
+       ndev->db_count = msix_min;
+       ndev->msix_vec_count = msix_max;
+       return 0;
+
+err_msix_request:
+       while (i-- > 0)
+               free_irq(ndev->msix[i].vector, &ndev->vec[i]);
+       pci_disable_msix(pdev);
+err_msix_enable:
+       kfree(ndev->msix);
+err_msix_alloc:
+       kfree(ndev->vec);
+err_msix_vec_alloc:
+       ndev->msix = NULL;
+       ndev->vec = NULL;
+
+       /* Try to set up msi irq */
+       rc = pci_enable_msi(pdev);
+       if (rc)
+               goto err_msi_enable;
+
+       rc = request_irq(pdev->irq, ndev_irq_isr, 0,
+                        "ndev_irq_isr", ndev);
+       if (rc)
+               goto err_msi_request;
+
+       dev_dbg(ndev_dev(ndev), "Using msi interrupts\n");
+       ndev->db_count = 1;
+       ndev->msix_vec_count = 1;
+       return 0;
+
+err_msi_request:
+       pci_disable_msi(pdev);
+err_msi_enable:
+
+       /* Try to set up intx irq */
+       pci_intx(pdev, 1);
+
+       rc = request_irq(pdev->irq, ndev_irq_isr, IRQF_SHARED,
+                        "ndev_irq_isr", ndev);
+       if (rc)
+               goto err_intx_request;
+
+       dev_dbg(ndev_dev(ndev), "Using intx interrupts\n");
+       ndev->db_count = 1;
+       ndev->msix_vec_count = 1;
+       return 0;
+
+err_intx_request:
+       return rc;
+}
+
+static void ndev_deinit_isr(struct amd_ntb_dev *ndev)
+{
+       struct pci_dev *pdev;
+       void __iomem *mmio = ndev->self_mmio;
+       int i;
+
+       pdev = ndev_pdev(ndev);
+
+       /* Mask all doorbell interrupts */
+       ndev->db_mask = ndev->db_valid_mask;
+       writel(ndev->db_mask, mmio + AMD_DBMASK_OFFSET);
+
+       if (ndev->msix) {
+               i = ndev->msix_vec_count;
+               while (i--)
+                       free_irq(ndev->msix[i].vector, &ndev->vec[i]);
+               pci_disable_msix(pdev);
+               kfree(ndev->msix);
+               kfree(ndev->vec);
+       } else {
+               free_irq(pdev->irq, ndev);
+               if (pci_dev_msi_enabled(pdev))
+                       pci_disable_msi(pdev);
+               else
+                       pci_intx(pdev, 0);
+       }
+}
+
+static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf,
+                                size_t count, loff_t *offp)
+{
+       struct amd_ntb_dev *ndev;
+       void __iomem *mmio;
+       char *buf;
+       size_t buf_size;
+       ssize_t ret, off;
+       union { u64 v64; u32 v32; u16 v16; } u;
+
+       ndev = filp->private_data;
+       mmio = ndev->self_mmio;
+
+       buf_size = min(count, 0x800ul);
+
+       buf = kmalloc(buf_size, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       off = 0;
+
+       off += scnprintf(buf + off, buf_size - off,
+                        "NTB Device Information:\n");
+
+       off += scnprintf(buf + off, buf_size - off,
+                        "Connection Topology -\t%s\n",
+                        ntb_topo_string(ndev->ntb.topo));
+
+       off += scnprintf(buf + off, buf_size - off,
+                        "LNK STA -\t\t%#06x\n", ndev->lnk_sta);
+
+       if (!amd_link_is_up(ndev)) {
+               off += scnprintf(buf + off, buf_size - off,
+                                "Link Status -\t\tDown\n");
+       } else {
+               off += scnprintf(buf + off, buf_size - off,
+                                "Link Status -\t\tUp\n");
+               off += scnprintf(buf + off, buf_size - off,
+                                "Link Speed -\t\tPCI-E Gen %u\n",
+                                NTB_LNK_STA_SPEED(ndev->lnk_sta));
+               off += scnprintf(buf + off, buf_size - off,
+                                "Link Width -\t\tx%u\n",
+                                NTB_LNK_STA_WIDTH(ndev->lnk_sta));
+       }
+
+       off += scnprintf(buf + off, buf_size - off,
+                        "Memory Window Count -\t%u\n", ndev->mw_count);
+       off += scnprintf(buf + off, buf_size - off,
+                        "Scratchpad Count -\t%u\n", ndev->spad_count);
+       off += scnprintf(buf + off, buf_size - off,
+                        "Doorbell Count -\t%u\n", ndev->db_count);
+       off += scnprintf(buf + off, buf_size - off,
+                        "MSIX Vector Count -\t%u\n", ndev->msix_vec_count);
+
+       off += scnprintf(buf + off, buf_size - off,
+                        "Doorbell Valid Mask -\t%#llx\n", ndev->db_valid_mask);
+
+       u.v32 = readl(ndev->self_mmio + AMD_DBMASK_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "Doorbell Mask -\t\t\t%#06x\n", u.v32);
+
+       u.v32 = readl(mmio + AMD_DBSTAT_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "Doorbell Bell -\t\t\t%#06x\n", u.v32);
+
+       off += scnprintf(buf + off, buf_size - off,
+                        "\nNTB Incoming XLAT:\n");
+
+       u.v64 = read64(mmio + AMD_BAR1XLAT_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "XLAT1 -\t\t%#018llx\n", u.v64);
+
+       u.v64 = read64(ndev->self_mmio + AMD_BAR23XLAT_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "XLAT23 -\t\t%#018llx\n", u.v64);
+
+       u.v64 = read64(ndev->self_mmio + AMD_BAR45XLAT_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "XLAT45 -\t\t%#018llx\n", u.v64);
+
+       u.v32 = readl(mmio + AMD_BAR1LMT_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "LMT1 -\t\t\t%#06x\n", u.v32);
+
+       u.v64 = read64(ndev->self_mmio + AMD_BAR23LMT_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "LMT23 -\t\t\t%#018llx\n", u.v64);
+
+       u.v64 = read64(ndev->self_mmio + AMD_BAR45LMT_OFFSET);
+       off += scnprintf(buf + off, buf_size - off,
+                        "LMT45 -\t\t\t%#018llx\n", u.v64);
+
+       ret = simple_read_from_buffer(ubuf, count, offp, buf, off);
+       kfree(buf);
+       return ret;
+}
+
+static void ndev_init_debugfs(struct amd_ntb_dev *ndev)
+{
+       if (!debugfs_dir) {
+               ndev->debugfs_dir = NULL;
+               ndev->debugfs_info = NULL;
+       } else {
+               ndev->debugfs_dir =
+                       debugfs_create_dir(ndev_name(ndev), debugfs_dir);
+               if (!ndev->debugfs_dir)
+                       ndev->debugfs_info = NULL;
+               else
+                       ndev->debugfs_info =
+                               debugfs_create_file("info", S_IRUSR,
+                                                   ndev->debugfs_dir, ndev,
+                                                   &amd_ntb_debugfs_info);
+       }
+}
+
+static void ndev_deinit_debugfs(struct amd_ntb_dev *ndev)
+{
+       debugfs_remove_recursive(ndev->debugfs_dir);
+}
+
+static inline void ndev_init_struct(struct amd_ntb_dev *ndev,
+                                   struct pci_dev *pdev)
+{
+       ndev->ntb.pdev = pdev;
+       ndev->ntb.topo = NTB_TOPO_NONE;
+       ndev->ntb.ops = &amd_ntb_ops;
+       ndev->int_mask = AMD_EVENT_INTMASK;
+       spin_lock_init(&ndev->db_mask_lock);
+}
+
+static int amd_poll_link(struct amd_ntb_dev *ndev)
+{
+       void __iomem *mmio = ndev->peer_mmio;
+       u32 reg, stat;
+       int rc;
+
+       reg = readl(mmio + AMD_SIDEINFO_OFFSET);
+       reg &= NTB_LIN_STA_ACTIVE_BIT;
+
+       dev_dbg(ndev_dev(ndev), "%s: reg_val = 0x%x.\n", __func__, reg);
+
+       if (reg == ndev->cntl_sta)
+               return 0;
+
+       ndev->cntl_sta = reg;
+
+       rc = pci_read_config_dword(ndev->ntb.pdev,
+                                  AMD_LINK_STATUS_OFFSET, &stat);
+       if (rc)
+               return 0;
+       ndev->lnk_sta = stat;
+
+       return 1;
+}
+
+static void amd_link_hb(struct work_struct *work)
+{
+       struct amd_ntb_dev *ndev = hb_ndev(work);
+
+       if (amd_poll_link(ndev))
+               ntb_link_event(&ndev->ntb);
+
+       if (!amd_link_is_up(ndev))
+               schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT);
+}
+
+static int amd_init_isr(struct amd_ntb_dev *ndev)
+{
+       return ndev_init_isr(ndev, AMD_DB_CNT, AMD_MSIX_VECTOR_CNT);
+}
+
+static void amd_init_side_info(struct amd_ntb_dev *ndev)
+{
+       void __iomem *mmio = ndev->self_mmio;
+       unsigned int reg;
+
+       reg = readl(mmio + AMD_SIDEINFO_OFFSET);
+       if (!(reg & AMD_SIDE_READY)) {
+               reg |= AMD_SIDE_READY;
+               writel(reg, mmio + AMD_SIDEINFO_OFFSET);
+       }
+}
+
+static void amd_deinit_side_info(struct amd_ntb_dev *ndev)
+{
+       void __iomem *mmio = ndev->self_mmio;
+       unsigned int reg;
+
+       reg = readl(mmio + AMD_SIDEINFO_OFFSET);
+       if (reg & AMD_SIDE_READY) {
+               reg &= ~AMD_SIDE_READY;
+               writel(reg, mmio + AMD_SIDEINFO_OFFSET);
+               readl(mmio + AMD_SIDEINFO_OFFSET);
+       }
+}
+
+static int amd_init_ntb(struct amd_ntb_dev *ndev)
+{
+       void __iomem *mmio = ndev->self_mmio;
+
+       ndev->mw_count = AMD_MW_CNT;
+       ndev->spad_count = AMD_SPADS_CNT;
+       ndev->db_count = AMD_DB_CNT;
+
+       switch (ndev->ntb.topo) {
+       case NTB_TOPO_PRI:
+       case NTB_TOPO_SEC:
+               ndev->spad_count >>= 1;
+               if (ndev->ntb.topo == NTB_TOPO_PRI) {
+                       ndev->self_spad = 0;
+                       ndev->peer_spad = 0x20;
+               } else {
+                       ndev->self_spad = 0x20;
+                       ndev->peer_spad = 0;
+               }
+
+               INIT_DELAYED_WORK(&ndev->hb_timer, amd_link_hb);
+               schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT);
+
+               break;
+       default:
+               dev_err(ndev_dev(ndev), "AMD NTB does not support B2B mode.\n");
+               return -EINVAL;
+       }
+
+       ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1;
+
+       /* Mask event interrupts */
+       writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET);
+
+       return 0;
+}
+
+static enum ntb_topo amd_get_topo(struct amd_ntb_dev *ndev)
+{
+       void __iomem *mmio = ndev->self_mmio;
+       u32 info;
+
+       info = readl(mmio + AMD_SIDEINFO_OFFSET);
+       if (info & AMD_SIDE_MASK)
+               return NTB_TOPO_SEC;
+       else
+               return NTB_TOPO_PRI;
+}
+
+static int amd_init_dev(struct amd_ntb_dev *ndev)
+{
+       struct pci_dev *pdev;
+       int rc = 0;
+
+       pdev = ndev_pdev(ndev);
+
+       ndev->ntb.topo = amd_get_topo(ndev);
+       dev_dbg(ndev_dev(ndev), "AMD NTB topo is %s\n",
+               ntb_topo_string(ndev->ntb.topo));
+
+       rc = amd_init_ntb(ndev);
+       if (rc)
+               return rc;
+
+       rc = amd_init_isr(ndev);
+       if (rc) {
+               dev_err(ndev_dev(ndev), "failed to init isr.\n");
+               return rc;
+       }
+
+       ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1;
+
+       return 0;
+}
+
+static void amd_deinit_dev(struct amd_ntb_dev *ndev)
+{
+       cancel_delayed_work_sync(&ndev->hb_timer);
+
+       ndev_deinit_isr(ndev);
+}
+
+static int amd_ntb_init_pci(struct amd_ntb_dev *ndev,
+                           struct pci_dev *pdev)
+{
+       int rc;
+
+       pci_set_drvdata(pdev, ndev);
+
+       rc = pci_enable_device(pdev);
+       if (rc)
+               goto err_pci_enable;
+
+       rc = pci_request_regions(pdev, NTB_NAME);
+       if (rc)
+               goto err_pci_regions;
+
+       pci_set_master(pdev);
+
+       rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (rc) {
+               rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+               if (rc)
+                       goto err_dma_mask;
+               dev_warn(ndev_dev(ndev), "Cannot DMA highmem\n");
+       }
+
+       rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (rc) {
+               rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+               if (rc)
+                       goto err_dma_mask;
+               dev_warn(ndev_dev(ndev), "Cannot DMA consistent highmem\n");
+       }
+
+       ndev->self_mmio = pci_iomap(pdev, 0, 0);
+       if (!ndev->self_mmio) {
+               rc = -EIO;
+               goto err_dma_mask;
+       }
+       ndev->peer_mmio = ndev->self_mmio + AMD_PEER_OFFSET;
+
+       return 0;
+
+err_dma_mask:
+       pci_clear_master(pdev);
+err_pci_regions:
+       pci_disable_device(pdev);
+err_pci_enable:
+       pci_set_drvdata(pdev, NULL);
+       return rc;
+}
+
+static void amd_ntb_deinit_pci(struct amd_ntb_dev *ndev)
+{
+       struct pci_dev *pdev = ndev_pdev(ndev);
+
+       pci_iounmap(pdev, ndev->self_mmio);
+
+       pci_clear_master(pdev);
+       pci_release_regions(pdev);
+       pci_disable_device(pdev);
+       pci_set_drvdata(pdev, NULL);
+}
+
+static int amd_ntb_pci_probe(struct pci_dev *pdev,
+                            const struct pci_device_id *id)
+{
+       struct amd_ntb_dev *ndev;
+       int rc, node;
+
+       node = dev_to_node(&pdev->dev);
+
+       ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node);
+       if (!ndev) {
+               rc = -ENOMEM;
+               goto err_ndev;
+       }
+
+       ndev_init_struct(ndev, pdev);
+
+       rc = amd_ntb_init_pci(ndev, pdev);
+       if (rc)
+               goto err_init_pci;
+
+       rc = amd_init_dev(ndev);
+       if (rc)
+               goto err_init_dev;
+
+       /* write side info */
+       amd_init_side_info(ndev);
+
+       amd_poll_link(ndev);
+
+       ndev_init_debugfs(ndev);
+
+       rc = ntb_register_device(&ndev->ntb);
+       if (rc)
+               goto err_register;
+
+       dev_info(&pdev->dev, "NTB device registered.\n");
+
+       return 0;
+
+err_register:
+       ndev_deinit_debugfs(ndev);
+       amd_deinit_dev(ndev);
+err_init_dev:
+       amd_ntb_deinit_pci(ndev);
+err_init_pci:
+       kfree(ndev);
+err_ndev:
+       return rc;
+}
+
+static void amd_ntb_pci_remove(struct pci_dev *pdev)
+{
+       struct amd_ntb_dev *ndev = pci_get_drvdata(pdev);
+
+       ntb_unregister_device(&ndev->ntb);
+       ndev_deinit_debugfs(ndev);
+       amd_deinit_side_info(ndev);
+       amd_deinit_dev(ndev);
+       amd_ntb_deinit_pci(ndev);
+       kfree(ndev);
+}
+
+static const struct file_operations amd_ntb_debugfs_info = {
+       .owner = THIS_MODULE,
+       .open = simple_open,
+       .read = ndev_debugfs_read,
+};
+
+static const struct pci_device_id amd_ntb_pci_tbl[] = {
+       {PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_NTB)},
+       {0}
+};
+MODULE_DEVICE_TABLE(pci, amd_ntb_pci_tbl);
+
+static struct pci_driver amd_ntb_pci_driver = {
+       .name           = KBUILD_MODNAME,
+       .id_table       = amd_ntb_pci_tbl,
+       .probe          = amd_ntb_pci_probe,
+       .remove         = amd_ntb_pci_remove,
+};
+
+static int __init amd_ntb_pci_driver_init(void)
+{
+       pr_info("%s %s\n", NTB_DESC, NTB_VER);
+
+       if (debugfs_initialized())
+               debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+       return pci_register_driver(&amd_ntb_pci_driver);
+}
+module_init(amd_ntb_pci_driver_init);
+
+static void __exit amd_ntb_pci_driver_exit(void)
+{
+       pci_unregister_driver(&amd_ntb_pci_driver);
+       debugfs_remove_recursive(debugfs_dir);
+}
+module_exit(amd_ntb_pci_driver_exit);
diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.h b/drivers/ntb/hw/amd/ntb_hw_amd.h
new file mode 100644 (file)
index 0000000..2eac3cd
--- /dev/null
@@ -0,0 +1,217 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of AMD Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * AMD PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Xiangliang Yu <Xiangliang.Yu@amd.com>
+ */
+
+#ifndef NTB_HW_AMD_H
+#define NTB_HW_AMD_H
+
+#include <linux/ntb.h>
+#include <linux/pci.h>
+
+#define PCI_DEVICE_ID_AMD_NTB  0x145B
+#define AMD_LINK_HB_TIMEOUT    msecs_to_jiffies(1000)
+#define AMD_LINK_STATUS_OFFSET 0x68
+#define NTB_LIN_STA_ACTIVE_BIT 0x00000002
+#define NTB_LNK_STA_SPEED_MASK 0x000F0000
+#define NTB_LNK_STA_WIDTH_MASK 0x03F00000
+#define NTB_LNK_STA_ACTIVE(x)  (!!((x) & NTB_LIN_STA_ACTIVE_BIT))
+#define NTB_LNK_STA_SPEED(x)   (((x) & NTB_LNK_STA_SPEED_MASK) >> 16)
+#define NTB_LNK_STA_WIDTH(x)   (((x) & NTB_LNK_STA_WIDTH_MASK) >> 20)
+
+#ifndef read64
+#ifdef readq
+#define read64 readq
+#else
+#define read64 _read64
+static inline u64 _read64(void __iomem *mmio)
+{
+       u64 low, high;
+
+       low = readl(mmio);
+       high = readl(mmio + sizeof(u32));
+       return low | (high << 32);
+}
+#endif
+#endif
+
+#ifndef write64
+#ifdef writeq
+#define write64 writeq
+#else
+#define write64 _write64
+static inline void _write64(u64 val, void __iomem *mmio)
+{
+       writel(val, mmio);
+       writel(val >> 32, mmio + sizeof(u32));
+}
+#endif
+#endif
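+
+/*
+ * Where the platform provides no native readq/writeq, the 64-bit accessors
+ * above fall back to two 32-bit MMIO accesses (low word first).  The split
+ * access is not atomic with respect to the device, so callers must tolerate
+ * torn values in registers the hardware may update concurrently.
+ */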
+
+enum {
+       /* AMD NTB Capability */
+       AMD_MW_CNT              = 3,
+       AMD_DB_CNT              = 16,
+       AMD_MSIX_VECTOR_CNT     = 24,
+       AMD_SPADS_CNT           = 16,
+
+       /*  AMD NTB register offset */
+       AMD_CNTL_OFFSET         = 0x200,
+
+       /* NTB control register bits */
+       PMM_REG_CTL             = BIT(21),
+       SMM_REG_CTL             = BIT(20),
+       SMM_REG_ACC_PATH        = BIT(18),
+       PMM_REG_ACC_PATH        = BIT(17),
+       NTB_CLK_EN              = BIT(16),
+
+       AMD_STA_OFFSET          = 0x204,
+       AMD_PGSLV_OFFSET        = 0x208,
+       AMD_SPAD_MUX_OFFSET     = 0x20C,
+       AMD_SPAD_OFFSET         = 0x210,
+       AMD_RSMU_HCID           = 0x250,
+       AMD_RSMU_SIID           = 0x254,
+       AMD_PSION_OFFSET        = 0x300,
+       AMD_SSION_OFFSET        = 0x330,
+       AMD_MMINDEX_OFFSET      = 0x400,
+       AMD_MMDATA_OFFSET       = 0x404,
+       AMD_SIDEINFO_OFFSET     = 0x408,
+
+       AMD_SIDE_MASK           = BIT(0),
+       AMD_SIDE_READY          = BIT(1),
+
+       /* limit register */
+       AMD_ROMBARLMT_OFFSET    = 0x410,
+       AMD_BAR1LMT_OFFSET      = 0x414,
+       AMD_BAR23LMT_OFFSET     = 0x418,
+       AMD_BAR45LMT_OFFSET     = 0x420,
+       /* xlat address */
+       AMD_POMBARXLAT_OFFSET   = 0x428,
+       AMD_BAR1XLAT_OFFSET     = 0x430,
+       AMD_BAR23XLAT_OFFSET    = 0x438,
+       AMD_BAR45XLAT_OFFSET    = 0x440,
+       /* doorbell and interrupt */
+       AMD_DBFM_OFFSET         = 0x450,
+       AMD_DBREQ_OFFSET        = 0x454,
+       AMD_MIRRDBSTAT_OFFSET   = 0x458,
+       AMD_DBMASK_OFFSET       = 0x45C,
+       AMD_DBSTAT_OFFSET       = 0x460,
+       AMD_INTMASK_OFFSET      = 0x470,
+       AMD_INTSTAT_OFFSET      = 0x474,
+
+       /* event type */
+       AMD_PEER_FLUSH_EVENT    = BIT(0),
+       AMD_PEER_RESET_EVENT    = BIT(1),
+       AMD_PEER_D3_EVENT       = BIT(2),
+       AMD_PEER_PMETO_EVENT    = BIT(3),
+       AMD_PEER_D0_EVENT       = BIT(4),
+       AMD_EVENT_INTMASK       = (AMD_PEER_FLUSH_EVENT |
+                               AMD_PEER_RESET_EVENT | AMD_PEER_D3_EVENT |
+                               AMD_PEER_PMETO_EVENT | AMD_PEER_D0_EVENT),
+
+       AMD_PMESTAT_OFFSET      = 0x480,
+       AMD_PMSGTRIG_OFFSET     = 0x490,
+       AMD_LTRLATENCY_OFFSET   = 0x494,
+       AMD_FLUSHTRIG_OFFSET    = 0x498,
+
+       /* SMU register*/
+       AMD_SMUACK_OFFSET       = 0x4A0,
+       AMD_SINRST_OFFSET       = 0x4A4,
+       AMD_RSPNUM_OFFSET       = 0x4A8,
+       AMD_SMU_SPADMUTEX       = 0x4B0,
+       AMD_SMU_SPADOFFSET      = 0x4B4,
+
+       AMD_PEER_OFFSET         = 0x400,
+};
+
+struct amd_ntb_dev;
+
+struct amd_ntb_vec {
+       struct amd_ntb_dev      *ndev;
+       int                     num;
+};
+
+struct amd_ntb_dev {
+       struct ntb_dev ntb;
+
+       u32 ntb_side;
+       u32 lnk_sta;
+       u32 cntl_sta;
+       u32 peer_sta;
+
+       unsigned char mw_count;
+       unsigned char spad_count;
+       unsigned char db_count;
+       unsigned char msix_vec_count;
+
+       u64 db_valid_mask;
+       u64 db_mask;
+       u32 int_mask;
+
+       struct msix_entry *msix;
+       struct amd_ntb_vec *vec;
+
+       /* synchronize rmw access of db_mask and hw reg */
+       spinlock_t db_mask_lock;
+
+       void __iomem *self_mmio;
+       void __iomem *peer_mmio;
+       unsigned int self_spad;
+       unsigned int peer_spad;
+
+       struct delayed_work hb_timer;
+
+       struct dentry *debugfs_dir;
+       struct dentry *debugfs_info;
+};
+
+#define ndev_pdev(ndev) ((ndev)->ntb.pdev)
+#define ndev_name(ndev) pci_name(ndev_pdev(ndev))
+#define ndev_dev(ndev) (&ndev_pdev(ndev)->dev)
+#define ntb_ndev(__ntb) container_of(__ntb, struct amd_ntb_dev, ntb)
+#define hb_ndev(__work) container_of(__work, struct amd_ntb_dev, hb_timer.work)
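+
+/*
+ * Example (hypothetical handler, shown only for illustration): hb_ndev()
+ * lets the heartbeat work function recover its device context:
+ *
+ *     static void amd_link_hb(struct work_struct *work)
+ *     {
+ *             struct amd_ntb_dev *ndev = hb_ndev(work);
+ *
+ *             amd_poll_link(ndev);
+ *     }
+ */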
+
+#endif
index a198f82982582a155b230e3964db17b6e01479d1..40d04ef5da9e865c8ddecfe5acf431fef5976daa 100644 (file)
@@ -875,7 +875,7 @@ static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
        limit_reg = bar2_off(ndev->xlat_reg->bar2_limit, bar);
 
        if (bar < 4 || !ndev->bar4_split) {
-               base = ioread64(mmio + base_reg);
+               base = ioread64(mmio + base_reg) & NTB_BAR_MASK_64;
 
                /* Set the limit if supported, if size is not mw_size */
                if (limit_reg && size != mw_size)
@@ -906,7 +906,7 @@ static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
                if ((addr + size) & (~0ull << 32))
                        return -EINVAL;
 
-               base = ioread32(mmio + base_reg);
+               base = ioread32(mmio + base_reg) & NTB_BAR_MASK_32;
 
                /* Set the limit if supported, if size is not mw_size */
                if (limit_reg && size != mw_size)
index 2eb4addd10d082c03184469619b6bafece08ddde..3ec149cf656276dd362fad25b37661b69d5fb3d0 100644 (file)
 #define NTB_UNSAFE_DB                  BIT_ULL(0)
 #define NTB_UNSAFE_SPAD                        BIT_ULL(1)
 
+#define NTB_BAR_MASK_64                        ~(0xfull)
+#define NTB_BAR_MASK_32                        ~(0xfu)
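+/*
+ * The low bits of a memory BAR base register hold type and prefetch flags
+ * rather than address bits; mask them off before treating the value read
+ * back from the register as a base address.
+ */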
+
 struct intel_ntb_dev;
 
 struct intel_ntb_reg {
@@ -334,7 +337,8 @@ struct intel_ntb_dev {
 #define ndev_pdev(ndev) ((ndev)->ntb.pdev)
 #define ndev_name(ndev) pci_name(ndev_pdev(ndev))
 #define ndev_dev(ndev) (&ndev_pdev(ndev)->dev)
-#define ntb_ndev(ntb) container_of(ntb, struct intel_ntb_dev, ntb)
-#define hb_ndev(work) container_of(work, struct intel_ntb_dev, hb_timer.work)
+#define ntb_ndev(__ntb) container_of(__ntb, struct intel_ntb_dev, ntb)
+#define hb_ndev(__work) container_of(__work, struct intel_ntb_dev, \
+                                    hb_timer.work)
 
 #endif
index 60654d524858c4bf52f6c5b5668c90719f523eda..ec4775f0ec163a2ac4e3d89851bedfb4a0d52db1 100644 (file)
@@ -171,12 +171,14 @@ struct ntb_transport_qp {
        u64 rx_err_ver;
        u64 rx_memcpy;
        u64 rx_async;
+       u64 dma_rx_prep_err;
        u64 tx_bytes;
        u64 tx_pkts;
        u64 tx_ring_full;
        u64 tx_err_no_buf;
        u64 tx_memcpy;
        u64 tx_async;
+       u64 dma_tx_prep_err;
 };
 
 struct ntb_transport_mw {
@@ -249,6 +251,8 @@ enum {
 #define QP_TO_MW(nt, qp)       ((qp) % nt->mw_count)
 #define NTB_QP_DEF_NUM_ENTRIES 100
 #define NTB_LINK_DOWN_TIMEOUT  10
+#define DMA_RETRIES            20
+#define DMA_OUT_RESOURCE_TO    50
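+/*
+ * If device_prep_dma_memcpy() returns no descriptor, the rx/tx paths below
+ * retry up to DMA_RETRIES times, sleeping DMA_OUT_RESOURCE_TO jiffies
+ * between attempts, before giving up and taking the error path.
+ */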
 
 static void ntb_transport_rxc_db(unsigned long data);
 static const struct ntb_ctx_ops ntb_transport_ops;
@@ -501,6 +505,12 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "free tx - \t%u\n",
                               ntb_transport_tx_free_entry(qp));
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "DMA tx prep err - \t%llu\n",
+                              qp->dma_tx_prep_err);
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "DMA rx prep err - \t%llu\n",
+                              qp->dma_rx_prep_err);
 
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "\n");
@@ -726,6 +736,8 @@ static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp)
        qp->tx_err_no_buf = 0;
        qp->tx_memcpy = 0;
        qp->tx_async = 0;
+       qp->dma_tx_prep_err = 0;
+       qp->dma_rx_prep_err = 0;
 }
 
 static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
@@ -1228,6 +1240,7 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset)
        struct dmaengine_unmap_data *unmap;
        dma_cookie_t cookie;
        void *buf = entry->buf;
+       int retries = 0;
 
        len = entry->len;
 
@@ -1263,11 +1276,21 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset)
 
        unmap->from_cnt = 1;
 
-       txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
-                                            unmap->addr[0], len,
-                                            DMA_PREP_INTERRUPT);
-       if (!txd)
+       for (retries = 0; retries < DMA_RETRIES; retries++) {
+               txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
+                                                    unmap->addr[0], len,
+                                                    DMA_PREP_INTERRUPT);
+               if (txd)
+                       break;
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(DMA_OUT_RESOURCE_TO);
+       }
+
+       if (!txd) {
+               qp->dma_rx_prep_err++;
                goto err_get_unmap;
+       }
 
        txd->callback = ntb_rx_copy_callback;
        txd->callback_param = entry;
@@ -1460,6 +1483,7 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
        void __iomem *offset;
        size_t len = entry->len;
        void *buf = entry->buf;
+       int retries = 0;
 
        offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index;
        hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header);
@@ -1494,10 +1518,20 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
 
        unmap->to_cnt = 1;
 
-       txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], len,
-                                            DMA_PREP_INTERRUPT);
-       if (!txd)
+       for (retries = 0; retries < DMA_RETRIES; retries++) {
+               txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0],
+                                                    len, DMA_PREP_INTERRUPT);
+               if (txd)
+                       break;
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(DMA_OUT_RESOURCE_TO);
+       }
+
+       if (!txd) {
+               qp->dma_tx_prep_err++;
                goto err_get_unmap;
+       }
 
        txd->callback = ntb_tx_copy_callback;
        txd->callback_param = entry;
@@ -1532,7 +1566,7 @@ static int ntb_process_tx(struct ntb_transport_qp *qp,
 
        if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) {
                if (qp->tx_handler)
-                       qp->tx_handler(qp->cb_data, qp, NULL, -EIO);
+                       qp->tx_handler(qp, qp->cb_data, NULL, -EIO);
 
                ntb_list_add(&qp->ntb_tx_free_q_lock, &entry->entry,
                             &qp->tx_free_q);
index 01852f98a843db8b9d04e4a9e5e115bce756a4b0..a5d0eda44438eefec4b88bad9d31afa484da4c6d 100644 (file)
@@ -17,3 +17,11 @@ config NTB_TOOL
         functioning at a basic level.
 
         If unsure, say N.
+
+config NTB_PERF
+       tristate "NTB RAW Perf Measuring Tool"
+       help
+        This is a tool to measure raw NTB performance by transferring data
+        to and from the window without additional software interaction.
+
+        If unsure, say N.
index 0ea32a324b6ca206a6f9beeaa4b86c75f0cab3e1..9e77e0b761c24fd45c6f435305e2b52e1705edee 100644 (file)
@@ -1,2 +1,3 @@
 obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
 obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
+obj-$(CONFIG_NTB_PERF) += ntb_perf.o
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
new file mode 100644 (file)
index 0000000..c8a37ba
--- /dev/null
@@ -0,0 +1,748 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *   PCIe NTB Perf Linux driver
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/sizes.h>
+#include <linux/ntb.h>
+
+#define DRIVER_NAME            "ntb_perf"
+#define DRIVER_DESCRIPTION     "PCIe NTB Performance Measurement Tool"
+
+#define DRIVER_LICENSE         "Dual BSD/GPL"
+#define DRIVER_VERSION         "1.0"
+#define DRIVER_AUTHOR          "Dave Jiang <dave.jiang@intel.com>"
+
+#define PERF_LINK_DOWN_TIMEOUT 10
+#define PERF_VERSION           0xffff0001
+#define MAX_THREADS            32
+#define MAX_TEST_SIZE          SZ_1M
+#define MAX_SRCS               32
+#define DMA_OUT_RESOURCE_TO    50
+#define DMA_RETRIES            20
+#define SZ_4G                  (1ULL << 32)
+#define MAX_SEG_ORDER          20 /* no larger than 1M for kmalloc buffer */
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct dentry *perf_debugfs_dir;
+
+static unsigned int seg_order = 19; /* 512K */
+module_param(seg_order, uint, 0644);
+MODULE_PARM_DESC(seg_order, "size order [2^n] of buffer segment for testing");
+
+static unsigned int run_order = 32; /* 4G */
+module_param(run_order, uint, 0644);
+MODULE_PARM_DESC(run_order, "size order [2^n] of total data to transfer");
+
+static bool use_dma; /* default to 0 */
+module_param(use_dma, bool, 0644);
+MODULE_PARM_DESC(use_dma, "Use the DMA engine to measure performance");
+
+struct perf_mw {
+       phys_addr_t     phys_addr;
+       resource_size_t phys_size;
+       resource_size_t xlat_align;
+       resource_size_t xlat_align_size;
+       void __iomem    *vbase;
+       size_t          xlat_size;
+       size_t          buf_size;
+       void            *virt_addr;
+       dma_addr_t      dma_addr;
+};
+
+struct perf_ctx;
+
+struct pthr_ctx {
+       struct task_struct      *thread;
+       struct perf_ctx         *perf;
+       atomic_t                dma_sync;
+       struct dma_chan         *dma_chan;
+       int                     dma_prep_err;
+       int                     src_idx;
+       void                    *srcs[MAX_SRCS];
+};
+
+struct perf_ctx {
+       struct ntb_dev          *ntb;
+       spinlock_t              db_lock;
+       struct perf_mw          mw;
+       bool                    link_is_up;
+       struct work_struct      link_cleanup;
+       struct delayed_work     link_work;
+       struct dentry           *debugfs_node_dir;
+       struct dentry           *debugfs_run;
+       struct dentry           *debugfs_threads;
+       u8                      perf_threads;
+       bool                    run;
+       struct pthr_ctx         pthr_ctx[MAX_THREADS];
+       atomic_t                tsync;
+};
+
+enum {
+       VERSION = 0,
+       MW_SZ_HIGH,
+       MW_SZ_LOW,
+       SPAD_MSG,
+       SPAD_ACK,
+       MAX_SPAD
+};
+
+static void perf_link_event(void *ctx)
+{
+       struct perf_ctx *perf = ctx;
+
+       if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
+               schedule_delayed_work(&perf->link_work, 2*HZ);
+       else
+               schedule_work(&perf->link_cleanup);
+}
+
+static void perf_db_event(void *ctx, int vec)
+{
+       struct perf_ctx *perf = ctx;
+       u64 db_bits, db_mask;
+
+       db_mask = ntb_db_vector_mask(perf->ntb, vec);
+       db_bits = ntb_db_read(perf->ntb);
+
+       dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
+               vec, db_mask, db_bits);
+}
+
+static const struct ntb_ctx_ops perf_ops = {
+       .link_event = perf_link_event,
+       .db_event = perf_db_event,
+};
+
+static void perf_copy_callback(void *data)
+{
+       struct pthr_ctx *pctx = data;
+
+       atomic_dec(&pctx->dma_sync);
+}
+
+static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
+                        char *src, size_t size)
+{
+       struct perf_ctx *perf = pctx->perf;
+       struct dma_async_tx_descriptor *txd;
+       struct dma_chan *chan = pctx->dma_chan;
+       struct dma_device *device;
+       struct dmaengine_unmap_data *unmap;
+       dma_cookie_t cookie;
+       size_t src_off, dst_off;
+       struct perf_mw *mw = &perf->mw;
+       u64 vbase, dst_vaddr;
+       dma_addr_t dst_phys;
+       int retries = 0;
+
+       if (!use_dma) {
+               memcpy_toio(dst, src, size);
+               return size;
+       }
+
+       if (!chan) {
+               dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
+               return -EINVAL;
+       }
+
+       device = chan->device;
+       src_off = (size_t)src & ~PAGE_MASK;
+       dst_off = (size_t)dst & ~PAGE_MASK;
+
+       if (!is_dma_copy_aligned(device, src_off, dst_off, size))
+               return -ENODEV;
+
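+       /*
+        * The copy destination lies inside the ioremapped memory window;
+        * rebase the vbase-relative offset onto the window's physical
+        * address so the DMA engine can target it directly.
+        */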
+       vbase = (u64)(u64 *)mw->vbase;
+       dst_vaddr = (u64)(u64 *)dst;
+       dst_phys = mw->phys_addr + (dst_vaddr - vbase);
+
+       unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
+       if (!unmap)
+               return -ENOMEM;
+
+       unmap->len = size;
+       unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
+                                     src_off, size, DMA_TO_DEVICE);
+       if (dma_mapping_error(device->dev, unmap->addr[0]))
+               goto err_get_unmap;
+
+       unmap->to_cnt = 1;
+
+       do {
+               txd = device->device_prep_dma_memcpy(chan, dst_phys,
+                                                    unmap->addr[0],
+                                                    size, DMA_PREP_INTERRUPT);
+               if (!txd) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       schedule_timeout(DMA_OUT_RESOURCE_TO);
+               }
+       } while (!txd && (++retries < DMA_RETRIES));
+
+       if (!txd) {
+               pctx->dma_prep_err++;
+               goto err_get_unmap;
+       }
+
+       txd->callback = perf_copy_callback;
+       txd->callback_param = pctx;
+       dma_set_unmap(txd, unmap);
+
+       cookie = dmaengine_submit(txd);
+       if (dma_submit_error(cookie))
+               goto err_set_unmap;
+
+       atomic_inc(&pctx->dma_sync);
+       dma_async_issue_pending(chan);
+
+       return size;
+
+err_set_unmap:
+       dmaengine_unmap_put(unmap);
+err_get_unmap:
+       dmaengine_unmap_put(unmap);
+       return 0;
+}
+
+static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
+                         u64 buf_size, u64 win_size, u64 total)
+{
+       int chunks, total_chunks, i;
+       int copied_chunks = 0;
+       u64 copied = 0, result;
+       char *tmp = dst;
+       u64 perf, diff_us;
+       ktime_t kstart, kstop, kdiff;
+
+       chunks = div64_u64(win_size, buf_size);
+       total_chunks = div64_u64(total, buf_size);
+       kstart = ktime_get();
+
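+       /*
+        * Walk the window in buf_size chunks, wrapping back to the start of
+        * the window every 'chunks' copies, until total_chunks copies have
+        * been issued.
+        */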
+       for (i = 0; i < total_chunks; i++) {
+               result = perf_copy(pctx, tmp, src, buf_size);
+               copied += result;
+               copied_chunks++;
+               if (copied_chunks == chunks) {
+                       tmp = dst;
+                       copied_chunks = 0;
+               } else
+                       tmp += buf_size;
+
+               /* For CPU copies, yield every 4GB to avoid soft lockups. */
+               if (((copied % SZ_4G) == 0) && !use_dma) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       schedule_timeout(1);
+               }
+       }
+
+       if (use_dma) {
+               pr_info("%s: All DMA descriptors submitted\n", current->comm);
+               while (atomic_read(&pctx->dma_sync) != 0)
+                       msleep(20);
+       }
+
+       kstop = ktime_get();
+       kdiff = ktime_sub(kstop, kstart);
+       diff_us = ktime_to_us(kdiff);
+
+       pr_info("%s: copied %llu bytes\n", current->comm, copied);
+
+       pr_info("%s: lasted %llu usecs\n", current->comm, diff_us);
+
+       perf = div64_u64(copied, diff_us);
+
+       pr_info("%s: MBytes/s: %llu\n", current->comm, perf);
+
+       return 0;
+}
+
+static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
+{
+       return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
+}
+
+static int ntb_perf_thread(void *data)
+{
+       struct pthr_ctx *pctx = data;
+       struct perf_ctx *perf = pctx->perf;
+       struct pci_dev *pdev = perf->ntb->pdev;
+       struct perf_mw *mw = &perf->mw;
+       char *dst;
+       u64 win_size, buf_size, total;
+       void *src;
+       int rc, node, i;
+       struct dma_chan *dma_chan = NULL;
+
+       pr_info("kthread %s starting...\n", current->comm);
+
+       node = dev_to_node(&pdev->dev);
+
+       if (use_dma && !pctx->dma_chan) {
+               dma_cap_mask_t dma_mask;
+
+               dma_cap_zero(dma_mask);
+               dma_cap_set(DMA_MEMCPY, dma_mask);
+               dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
+                                              (void *)(unsigned long)node);
+               if (!dma_chan) {
+                       pr_warn("%s: cannot acquire DMA channel, quitting\n",
+                               current->comm);
+                       return -ENODEV;
+               }
+               pctx->dma_chan = dma_chan;
+       }
+
+       for (i = 0; i < MAX_SRCS; i++) {
+               pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
+               if (!pctx->srcs[i]) {
+                       rc = -ENOMEM;
+                       goto err;
+               }
+       }
+
+       win_size = mw->phys_size;
+       buf_size = 1ULL << seg_order;
+       total = 1ULL << run_order;
+
+       if (buf_size > MAX_TEST_SIZE)
+               buf_size = MAX_TEST_SIZE;
+
+       dst = (char *)mw->vbase;
+
+       atomic_inc(&perf->tsync);
+       while (atomic_read(&perf->tsync) != perf->perf_threads)
+               schedule();
+
+       src = pctx->srcs[pctx->src_idx];
+       pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);
+
+       rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);
+
+       atomic_dec(&perf->tsync);
+
+       if (rc < 0) {
+               pr_err("%s: failed\n", current->comm);
+               rc = -ENXIO;
+               goto err;
+       }
+
+       for (i = 0; i < MAX_SRCS; i++) {
+               kfree(pctx->srcs[i]);
+               pctx->srcs[i] = NULL;
+       }
+
+       return 0;
+
+err:
+       for (i = 0; i < MAX_SRCS; i++) {
+               kfree(pctx->srcs[i]);
+               pctx->srcs[i] = NULL;
+       }
+
+       if (dma_chan) {
+               dma_release_channel(dma_chan);
+               pctx->dma_chan = NULL;
+       }
+
+       return rc;
+}
+
+static void perf_free_mw(struct perf_ctx *perf)
+{
+       struct perf_mw *mw = &perf->mw;
+       struct pci_dev *pdev = perf->ntb->pdev;
+
+       if (!mw->virt_addr)
+               return;
+
+       ntb_mw_clear_trans(perf->ntb, 0);
+       dma_free_coherent(&pdev->dev, mw->buf_size,
+                         mw->virt_addr, mw->dma_addr);
+       mw->xlat_size = 0;
+       mw->buf_size = 0;
+       mw->virt_addr = NULL;
+}
+
+static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
+{
+       struct perf_mw *mw = &perf->mw;
+       size_t xlat_size, buf_size;
+
+       if (!size)
+               return -EINVAL;
+
+       xlat_size = round_up(size, mw->xlat_align_size);
+       buf_size = round_up(size, mw->xlat_align);
+
+       if (mw->xlat_size == xlat_size)
+               return 0;
+
+       if (mw->buf_size)
+               perf_free_mw(perf);
+
+       mw->xlat_size = xlat_size;
+       mw->buf_size = buf_size;
+
+       mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
+                                          &mw->dma_addr, GFP_KERNEL);
+       if (!mw->virt_addr) {
+               mw->xlat_size = 0;
+               mw->buf_size = 0;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void perf_link_work(struct work_struct *work)
+{
+       struct perf_ctx *perf =
+               container_of(work, struct perf_ctx, link_work.work);
+       struct ntb_dev *ndev = perf->ntb;
+       struct pci_dev *pdev = ndev->pdev;
+       u32 val;
+       u64 size;
+       int rc;
+
+       dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+       size = perf->mw.phys_size;
+       ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size));
+       ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size));
+       ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
+
+       /* now read what peer wrote */
+       val = ntb_spad_read(ndev, VERSION);
+       if (val != PERF_VERSION) {
+               dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
+               goto out;
+       }
+
+       val = ntb_spad_read(ndev, MW_SZ_HIGH);
+       size = (u64)val << 32;
+
+       val = ntb_spad_read(ndev, MW_SZ_LOW);
+       size |= val;
+
+       dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);
+
+       rc = perf_set_mw(perf, size);
+       if (rc)
+               goto out1;
+
+       perf->link_is_up = true;
+
+       return;
+
+out1:
+       perf_free_mw(perf);
+
+out:
+       if (ntb_link_is_up(ndev, NULL, NULL) == 1)
+               schedule_delayed_work(&perf->link_work,
+                                     msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
+}
+
+static void perf_link_cleanup(struct work_struct *work)
+{
+       struct perf_ctx *perf = container_of(work,
+                                            struct perf_ctx,
+                                            link_cleanup);
+
+       dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+       if (!perf->link_is_up)
+               cancel_delayed_work_sync(&perf->link_work);
+}
+
+static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
+{
+       struct perf_mw *mw;
+       int rc;
+
+       mw = &perf->mw;
+
+       rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
+                             &mw->xlat_align, &mw->xlat_align_size);
+       if (rc)
+               return rc;
+
+       perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
+       if (!mw->vbase)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
+                               size_t count, loff_t *offp)
+{
+       struct perf_ctx *perf = filp->private_data;
+       char *buf;
+       ssize_t ret, out_offset;
+
+       if (!perf)
+               return 0;
+
+       buf = kmalloc(64, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       out_offset = snprintf(buf, 64, "%d\n", perf->run);
+       ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+       kfree(buf);
+
+       return ret;
+}
+
+static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
+                                size_t count, loff_t *offp)
+{
+       struct perf_ctx *perf = filp->private_data;
+       int node, i;
+
+       if (!perf->link_is_up)
+               return 0;
+
+       if (perf->perf_threads == 0)
+               return 0;
+
+       if (atomic_read(&perf->tsync) == 0)
+               perf->run = false;
+
+       if (perf->run) {
+               /* let's stop the threads */
+               perf->run = false;
+               for (i = 0; i < MAX_THREADS; i++) {
+                       if (perf->pthr_ctx[i].thread) {
+                               kthread_stop(perf->pthr_ctx[i].thread);
+                               perf->pthr_ctx[i].thread = NULL;
+                       } else
+                               break;
+               }
+       } else {
+               perf->run = true;
+
+               if (perf->perf_threads > MAX_THREADS) {
+                       perf->perf_threads = MAX_THREADS;
+                       pr_info("Reset total threads to: %u\n", MAX_THREADS);
+               }
+
+               /* no greater than 1M */
+               if (seg_order > MAX_SEG_ORDER) {
+                       seg_order = MAX_SEG_ORDER;
+                       pr_info("Fix seg_order to %u\n", seg_order);
+               }
+
+               if (run_order < seg_order) {
+                       run_order = seg_order;
+                       pr_info("Fix run_order to %u\n", run_order);
+               }
+
+               node = dev_to_node(&perf->ntb->pdev->dev);
+               /* launch kernel thread */
+               for (i = 0; i < perf->perf_threads; i++) {
+                       struct pthr_ctx *pctx;
+
+                       pctx = &perf->pthr_ctx[i];
+                       atomic_set(&pctx->dma_sync, 0);
+                       pctx->perf = perf;
+                       pctx->thread =
+                               kthread_create_on_node(ntb_perf_thread,
+                                                      (void *)pctx,
+                                                      node, "ntb_perf %d", i);
+                       if (!IS_ERR_OR_NULL(pctx->thread))
+                               wake_up_process(pctx->thread);
+                       else {
+                               pctx->thread = NULL;
+                               perf->run = false;
+                               /* stop any threads already started */
+                               for (i = 0; i < MAX_THREADS; i++) {
+                                       if (perf->pthr_ctx[i].thread) {
+                                               kthread_stop(perf->pthr_ctx[i].thread);
+                                               perf->pthr_ctx[i].thread = NULL;
+                                       }
+                               }
+                       }
+
+                       if (!perf->run)
+                               return -ENXIO;
+               }
+
+       }
+
+       return count;
+}
+
+static const struct file_operations ntb_perf_debugfs_run = {
+       .owner = THIS_MODULE,
+       .open = simple_open,
+       .read = debugfs_run_read,
+       .write = debugfs_run_write,
+};
+
+static int perf_debugfs_setup(struct perf_ctx *perf)
+{
+       struct pci_dev *pdev = perf->ntb->pdev;
+
+       if (!debugfs_initialized())
+               return -ENODEV;
+
+       if (!perf_debugfs_dir) {
+               perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+               if (!perf_debugfs_dir)
+                       return -ENODEV;
+       }
+
+       perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
+                                                   perf_debugfs_dir);
+       if (!perf->debugfs_node_dir)
+               return -ENODEV;
+
+       perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
+                                               perf->debugfs_node_dir, perf,
+                                               &ntb_perf_debugfs_run);
+       if (!perf->debugfs_run)
+               return -ENODEV;
+
+       perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
+                                                 perf->debugfs_node_dir,
+                                                 &perf->perf_threads);
+       if (!perf->debugfs_threads)
+               return -ENODEV;
+
+       return 0;
+}
+
+static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
+{
+       struct pci_dev *pdev = ntb->pdev;
+       struct perf_ctx *perf;
+       int node;
+       int rc = 0;
+
+       node = dev_to_node(&pdev->dev);
+
+       perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
+       if (!perf) {
+               rc = -ENOMEM;
+               goto err_perf;
+       }
+
+       perf->ntb = ntb;
+       perf->perf_threads = 1;
+       atomic_set(&perf->tsync, 0);
+       perf->run = false;
+       spin_lock_init(&perf->db_lock);
+       INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
+       INIT_WORK(&perf->link_cleanup, perf_link_cleanup);
+
+       rc = perf_setup_mw(ntb, perf);
+       if (rc)
+               goto err_ctx;
+
+       rc = ntb_set_ctx(ntb, perf, &perf_ops);
+       if (rc)
+               goto err_ctx;
+
+       perf->link_is_up = false;
+       ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+       ntb_link_event(ntb);
+
+       rc = perf_debugfs_setup(perf);
+       if (rc)
+               goto err_ctx;
+
+       return 0;
+
+err_ctx:
+       cancel_delayed_work_sync(&perf->link_work);
+       cancel_work_sync(&perf->link_cleanup);
+       kfree(perf);
+err_perf:
+       return rc;
+}
+
+static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
+{
+       struct perf_ctx *perf = ntb->ctx;
+       int i;
+
+       dev_dbg(&perf->ntb->dev, "%s called\n", __func__);
+
+       cancel_delayed_work_sync(&perf->link_work);
+       cancel_work_sync(&perf->link_cleanup);
+
+       ntb_clear_ctx(ntb);
+       ntb_link_disable(ntb);
+
+       debugfs_remove_recursive(perf_debugfs_dir);
+       perf_debugfs_dir = NULL;
+
+       if (use_dma) {
+               for (i = 0; i < MAX_THREADS; i++) {
+                       struct pthr_ctx *pctx = &perf->pthr_ctx[i];
+
+                       if (pctx->dma_chan)
+                               dma_release_channel(pctx->dma_chan);
+               }
+       }
+
+       kfree(perf);
+}
+
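+/*
+ * Usage sketch (paths assumed; the per-device directory is named after
+ * pci_name()): the tool is driven entirely through debugfs, e.g.
+ *
+ *     echo 8 > /sys/kernel/debug/ntb_perf/<pci-id>/threads
+ *     echo 1 > /sys/kernel/debug/ntb_perf/<pci-id>/run
+ *
+ * Each thread reports bytes copied, elapsed time, and throughput through
+ * pr_info() in the kernel log.
+ */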
+static struct ntb_client perf_client = {
+       .ops = {
+               .probe = perf_probe,
+               .remove = perf_remove,
+       },
+};
+module_ntb_client(perf_client);
index 002a94abdbc45c7f20255b804efae684185283c9..5d6237391dcd4e3851390abe9b1412217d2428d8 100644 (file)
@@ -8,3 +8,14 @@ config BLK_DEV_NVME
 
          To compile this driver as a module, choose M here: the
          module will be called nvme.
+
+config BLK_DEV_NVME_SCSI
+       bool "SCSI emulation for NVMe device nodes"
+       depends on BLK_DEV_NVME
+       ---help---
+         This adds support for the SG_IO ioctl on the NVMe character
+         and block device nodes, as well as a translation of a small
+         number of selected SCSI commands to NVMe commands within the
+         NVMe driver.  If you don't know what this means you probably want
+         to say N here, and if you know what it means you probably
+         want to say N as well.
index a5fe239525868b1b86ec2d5337cfdb5e324424d6..51bf90871549844c7f4db638a7ec26c4b4b9aa51 100644 (file)
@@ -1,5 +1,6 @@
 
 obj-$(CONFIG_BLK_DEV_NVME)     += nvme.o
 
-lightnvm-$(CONFIG_NVM) := lightnvm.o
-nvme-y         += pci.o scsi.o $(lightnvm-y)
+lightnvm-$(CONFIG_NVM)                 := lightnvm.o
+nvme-y                                 += core.o pci.o $(lightnvm-y)
+nvme-$(CONFIG_BLK_DEV_NVME_SCSI)        += scsi.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
new file mode 100644 (file)
index 0000000..c5bf001
--- /dev/null
@@ -0,0 +1,1472 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/hdreg.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list_sort.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/pr.h>
+#include <linux/ptrace.h>
+#include <linux/nvme_ioctl.h>
+#include <linux/t10-pi.h>
+#include <scsi/sg.h>
+#include <asm/unaligned.h>
+
+#include "nvme.h"
+
+#define NVME_MINORS            (1U << MINORBITS)
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int nvme_char_major;
+module_param(nvme_char_major, int, 0);
+
+static LIST_HEAD(nvme_ctrl_list);
+DEFINE_SPINLOCK(dev_list_lock);
+
+static struct class *nvme_class;
+
+static void nvme_free_ns(struct kref *kref)
+{
+       struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
+
+       if (ns->type == NVME_NS_LIGHTNVM)
+               nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
+
+       spin_lock(&dev_list_lock);
+       ns->disk->private_data = NULL;
+       spin_unlock(&dev_list_lock);
+
+       nvme_put_ctrl(ns->ctrl);
+       put_disk(ns->disk);
+       kfree(ns);
+}
+
+static void nvme_put_ns(struct nvme_ns *ns)
+{
+       kref_put(&ns->kref, nvme_free_ns);
+}
+
+static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
+{
+       struct nvme_ns *ns;
+
+       spin_lock(&dev_list_lock);
+       ns = disk->private_data;
+       if (ns && !kref_get_unless_zero(&ns->kref))
+               ns = NULL;
+       spin_unlock(&dev_list_lock);
+
+       return ns;
+}
+
+void nvme_requeue_req(struct request *req)
+{
+       unsigned long flags;
+
+       blk_mq_requeue_request(req);
+       spin_lock_irqsave(req->q->queue_lock, flags);
+       if (!blk_queue_stopped(req->q))
+               blk_mq_kick_requeue_list(req->q);
+       spin_unlock_irqrestore(req->q->queue_lock, flags);
+}
+
+struct request *nvme_alloc_request(struct request_queue *q,
+               struct nvme_command *cmd, unsigned int flags)
+{
+       bool write = cmd->common.opcode & 1;
+       struct request *req;
+
+       req = blk_mq_alloc_request(q, write, flags);
+       if (IS_ERR(req))
+               return req;
+
+       req->cmd_type = REQ_TYPE_DRV_PRIV;
+       req->cmd_flags |= REQ_FAILFAST_DRIVER;
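+
+       /* Return a pristine passthrough request: no data and no bios are
+        * attached here; callers map their own buffers onto it.
+        */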
+       req->__data_len = 0;
+       req->__sector = (sector_t) -1;
+       req->bio = req->biotail = NULL;
+
+       req->cmd = (unsigned char *)cmd;
+       req->cmd_len = sizeof(struct nvme_command);
+       req->special = (void *)0;
+
+       return req;
+}
+
+/*
+ * Returns 0 on success.  If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+               void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
+{
+       struct request *req;
+       int ret;
+
+       req = nvme_alloc_request(q, cmd, 0);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+       if (buffer && bufflen) {
+               ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
+               if (ret)
+                       goto out;
+       }
+
+       blk_execute_rq(req->q, NULL, req, 0);
+       if (result)
+               *result = (u32)(uintptr_t)req->special;
+       ret = req->errors;
+ out:
+       blk_mq_free_request(req);
+       return ret;
+}
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+               void *buffer, unsigned bufflen)
+{
+       return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
+}
+
+int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+               void __user *ubuffer, unsigned bufflen,
+               void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
+               u32 *result, unsigned timeout)
+{
+       bool write = cmd->common.opcode & 1;
+       struct nvme_ns *ns = q->queuedata;
+       struct gendisk *disk = ns ? ns->disk : NULL;
+       struct request *req;
+       struct bio *bio = NULL;
+       void *meta = NULL;
+       int ret;
+
+       req = nvme_alloc_request(q, cmd, 0);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+       if (ubuffer && bufflen) {
+               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
+                               GFP_KERNEL);
+               if (ret)
+                       goto out;
+               bio = req->bio;
+
+               if (!disk)
+                       goto submit;
+               bio->bi_bdev = bdget_disk(disk, 0);
+               if (!bio->bi_bdev) {
+                       ret = -ENODEV;
+                       goto out_unmap;
+               }
+
+               if (meta_buffer) {
+                       struct bio_integrity_payload *bip;
+
+                       meta = kmalloc(meta_len, GFP_KERNEL);
+                       if (!meta) {
+                               ret = -ENOMEM;
+                               goto out_unmap;
+                       }
+
+                       if (write) {
+                               if (copy_from_user(meta, meta_buffer,
+                                               meta_len)) {
+                                       ret = -EFAULT;
+                                       goto out_free_meta;
+                               }
+                       }
+
+                       bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+                       if (IS_ERR(bip)) {
+                               ret = PTR_ERR(bip);
+                               goto out_free_meta;
+                       }
+
+                       bip->bip_iter.bi_size = meta_len;
+                       bip->bip_iter.bi_sector = meta_seed;
+
+                       ret = bio_integrity_add_page(bio, virt_to_page(meta),
+                                       meta_len, offset_in_page(meta));
+                       if (ret != meta_len) {
+                               ret = -ENOMEM;
+                               goto out_free_meta;
+                       }
+               }
+       }
+ submit:
+       blk_execute_rq(req->q, disk, req, 0);
+       ret = req->errors;
+       if (result)
+               *result = (u32)(uintptr_t)req->special;
+       if (meta && !ret && !write) {
+               if (copy_to_user(meta_buffer, meta, meta_len))
+                       ret = -EFAULT;
+       }
+ out_free_meta:
+       kfree(meta);
+ out_unmap:
+       if (bio) {
+               if (disk && bio->bi_bdev)
+                       bdput(bio->bi_bdev);
+               blk_rq_unmap_user(bio);
+       }
+ out:
+       blk_mq_free_request(req);
+       return ret;
+}
+
+int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+               void __user *ubuffer, unsigned bufflen, u32 *result,
+               unsigned timeout)
+{
+       return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
+                       result, timeout);
+}
+
+int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
+{
+       struct nvme_command c = { };
+       int error;
+
+       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+       c.identify.opcode = nvme_admin_identify;
+       c.identify.cns = cpu_to_le32(1);
+
+       *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+       if (!*id)
+               return -ENOMEM;
+
+       error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+                       sizeof(struct nvme_id_ctrl));
+       if (error)
+               kfree(*id);
+       return error;
+}
+
+static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
+{
+       struct nvme_command c = { };
+
+       c.identify.opcode = nvme_admin_identify;
+       c.identify.cns = cpu_to_le32(2);
+       c.identify.nsid = cpu_to_le32(nsid);
+       return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
+}
+
+int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
+               struct nvme_id_ns **id)
+{
+       struct nvme_command c = { };
+       int error;
+
+       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+       c.identify.opcode = nvme_admin_identify;
+       c.identify.nsid = cpu_to_le32(nsid);
+
+       *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+       if (!*id)
+               return -ENOMEM;
+
+       error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+                       sizeof(struct nvme_id_ns));
+       if (error)
+               kfree(*id);
+       return error;
+}
+
+int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
+                                       dma_addr_t dma_addr, u32 *result)
+{
+       struct nvme_command c;
+
+       memset(&c, 0, sizeof(c));
+       c.features.opcode = nvme_admin_get_features;
+       c.features.nsid = cpu_to_le32(nsid);
+       c.features.prp1 = cpu_to_le64(dma_addr);
+       c.features.fid = cpu_to_le32(fid);
+
+       return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+}
+
+int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
+                                       dma_addr_t dma_addr, u32 *result)
+{
+       struct nvme_command c;
+
+       memset(&c, 0, sizeof(c));
+       c.features.opcode = nvme_admin_set_features;
+       c.features.prp1 = cpu_to_le64(dma_addr);
+       c.features.fid = cpu_to_le32(fid);
+       c.features.dword11 = cpu_to_le32(dword11);
+
+       return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+}
+
+int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
+{
+       struct nvme_command c = { };
+       int error;
+
+       c.common.opcode = nvme_admin_get_log_page;
+       c.common.nsid = cpu_to_le32(0xFFFFFFFF);
+       c.common.cdw10[0] = cpu_to_le32(
+                       (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
+                        NVME_LOG_SMART);
+
+       *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
+       if (!*log)
+               return -ENOMEM;
+
+       error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
+                       sizeof(struct nvme_smart_log));
+       if (error)
+               kfree(*log);
+       return error;
+}
+
+int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
+{
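+       /*
+        * The Number of Queues feature takes zero-based counts: requested
+        * submission queues in the low 16 bits, completion queues in the
+        * high 16 bits.  The controller replies with the counts actually
+        * allocated, and *count is clamped to the smaller of the two.
+        */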
+       u32 q_count = (*count - 1) | ((*count - 1) << 16);
+       u32 result;
+       int status, nr_io_queues;
+
+       status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
+                       &result);
+       if (status)
+               return status;
+
+       nr_io_queues = min(result & 0xffff, result >> 16) + 1;
+       *count = min(*count, nr_io_queues);
+       return 0;
+}
+
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+       struct nvme_user_io io;
+       struct nvme_command c;
+       unsigned length, meta_len;
+       void __user *metadata;
+
+       if (copy_from_user(&io, uio, sizeof(io)))
+               return -EFAULT;
+
+       switch (io.opcode) {
+       case nvme_cmd_write:
+       case nvme_cmd_read:
+       case nvme_cmd_compare:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       length = (io.nblocks + 1) << ns->lba_shift;
+       meta_len = (io.nblocks + 1) * ns->ms;
+       metadata = (void __user *)(uintptr_t)io.metadata;
+
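+       /* Extended-LBA formats carry metadata inline with the data, so fold
+        * the metadata length into the data transfer instead of sending it
+        * as a separate integrity buffer.
+        */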
+       if (ns->ext) {
+               length += meta_len;
+               meta_len = 0;
+       } else if (meta_len) {
+               if ((io.metadata & 3) || !io.metadata)
+                       return -EINVAL;
+       }
+
+       memset(&c, 0, sizeof(c));
+       c.rw.opcode = io.opcode;
+       c.rw.flags = io.flags;
+       c.rw.nsid = cpu_to_le32(ns->ns_id);
+       c.rw.slba = cpu_to_le64(io.slba);
+       c.rw.length = cpu_to_le16(io.nblocks);
+       c.rw.control = cpu_to_le16(io.control);
+       c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+       c.rw.reftag = cpu_to_le32(io.reftag);
+       c.rw.apptag = cpu_to_le16(io.apptag);
+       c.rw.appmask = cpu_to_le16(io.appmask);
+
+       return __nvme_submit_user_cmd(ns->queue, &c,
+                       (void __user *)(uintptr_t)io.addr, length,
+                       metadata, meta_len, io.slba, NULL, 0);
+}
+
+static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+                       struct nvme_passthru_cmd __user *ucmd)
+{
+       struct nvme_passthru_cmd cmd;
+       struct nvme_command c;
+       unsigned timeout = 0;
+       int status;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EACCES;
+       if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+               return -EFAULT;
+
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = cmd.opcode;
+       c.common.flags = cmd.flags;
+       c.common.nsid = cpu_to_le32(cmd.nsid);
+       c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+       c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+       c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
+       c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
+       c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
+       c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
+       c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
+       c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+
+       if (cmd.timeout_ms)
+               timeout = msecs_to_jiffies(cmd.timeout_ms);
+
+       status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+                       (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
+                       &cmd.result, timeout);
+       if (status >= 0) {
+               if (put_user(cmd.result, &ucmd->result))
+                       return -EFAULT;
+       }
+
+       return status;
+}
+
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+               unsigned int cmd, unsigned long arg)
+{
+       struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+       switch (cmd) {
+       case NVME_IOCTL_ID:
+               force_successful_syscall_return();
+               return ns->ns_id;
+       case NVME_IOCTL_ADMIN_CMD:
+               return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
+       case NVME_IOCTL_IO_CMD:
+               return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
+       case NVME_IOCTL_SUBMIT_IO:
+               return nvme_submit_io(ns, (void __user *)arg);
+#ifdef CONFIG_BLK_DEV_NVME_SCSI
+       case SG_GET_VERSION_NUM:
+               return nvme_sg_get_version_num((void __user *)arg);
+       case SG_IO:
+               return nvme_sg_io(ns, (void __user *)arg);
+#endif
+       default:
+               return -ENOTTY;
+       }
+}
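+
+/*
+ * Example (userspace sketch; the device node name is illustrative):
+ * NVME_IOCTL_ID returns the namespace ID in the syscall return value:
+ *
+ *     int fd = open("/dev/nvme0n1", O_RDONLY);
+ *     int nsid = ioctl(fd, NVME_IOCTL_ID);
+ */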
+
+#ifdef CONFIG_COMPAT
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+                       unsigned int cmd, unsigned long arg)
+{
+       switch (cmd) {
+       case SG_IO:
+               return -ENOIOCTLCMD;
+       }
+       return nvme_ioctl(bdev, mode, cmd, arg);
+}
+#else
+#define nvme_compat_ioctl      NULL
+#endif
+
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+       return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
+}
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+       nvme_put_ns(disk->private_data);
+}
+
+static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+       /* some standard values */
+       geo->heads = 1 << 6;
+       geo->sectors = 1 << 5;
+       geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
+       return 0;
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+       struct blk_integrity integrity;
+
+       switch (ns->pi_type) {
+       case NVME_NS_DPS_PI_TYPE3:
+               integrity.profile = &t10_pi_type3_crc;
+               break;
+       case NVME_NS_DPS_PI_TYPE1:
+       case NVME_NS_DPS_PI_TYPE2:
+               integrity.profile = &t10_pi_type1_crc;
+               break;
+       default:
+               integrity.profile = NULL;
+               break;
+       }
+       integrity.tuple_size = ns->ms;
+       blk_integrity_register(ns->disk, &integrity);
+       blk_queue_max_integrity_segments(ns->queue, 1);
+}
+#else
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+
+static void nvme_config_discard(struct nvme_ns *ns)
+{
+       u32 logical_block_size = queue_logical_block_size(ns->queue);
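+
+       /* DSM deallocate operates on logical blocks, so use LBA granularity */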
+       ns->queue->limits.discard_zeroes_data = 0;
+       ns->queue->limits.discard_alignment = logical_block_size;
+       ns->queue->limits.discard_granularity = logical_block_size;
+       blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
+       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+}
+
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+       struct nvme_ns *ns = disk->private_data;
+       struct nvme_id_ns *id;
+       u8 lbaf, pi_type;
+       u16 old_ms;
+       unsigned short bs;
+
+       if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
+               dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
+                               __func__, ns->ctrl->instance, ns->ns_id);
+               return -ENODEV;
+       }
+       if (id->ncap == 0) {
+               kfree(id);
+               return -ENODEV;
+       }
+
+       if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
+               if (nvme_nvm_register(ns->queue, disk->disk_name)) {
+                       dev_warn(ns->ctrl->dev,
+                               "%s: LightNVM init failure\n", __func__);
+                       kfree(id);
+                       return -ENODEV;
+               }
+               ns->type = NVME_NS_LIGHTNVM;
+       }
+
+       if (ns->ctrl->vs >= NVME_VS(1, 1))
+               memcpy(ns->eui, id->eui64, sizeof(ns->eui));
+       if (ns->ctrl->vs >= NVME_VS(1, 2))
+               memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));
+
+       old_ms = ns->ms;
+       lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+       ns->lba_shift = id->lbaf[lbaf].ds;
+       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+       /*
+        * If the namespace reports no LBA data size, fall back to the
+        * default 512 byte block size so the block layer can use the queue
+        * before failing reads/writes on a zero-capacity namespace.
+        */
+       if (ns->lba_shift == 0)
+               ns->lba_shift = 9;
+       bs = 1 << ns->lba_shift;
+       /* XXX: PI requires a metadata size equal to the T10 PI tuple size */
+       pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
+                                       id->dps & NVME_NS_DPS_PI_MASK : 0;
+
+       blk_mq_freeze_queue(disk->queue);
+       if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
+                               ns->ms != old_ms ||
+                               bs != queue_logical_block_size(disk->queue) ||
+                               (ns->ms && ns->ext)))
+               blk_integrity_unregister(disk);
+
+       ns->pi_type = pi_type;
+       blk_queue_logical_block_size(ns->queue, bs);
+
+       if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
+               nvme_init_integrity(ns);
+       if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
+               set_capacity(disk, 0);
+       else
+               set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+       if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+               nvme_config_discard(ns);
+       blk_mq_unfreeze_queue(disk->queue);
+
+       kfree(id);
+       return 0;
+}
+
+static char nvme_pr_type(enum pr_type type)
+{
+       switch (type) {
+       case PR_WRITE_EXCLUSIVE:
+               return 1;
+       case PR_EXCLUSIVE_ACCESS:
+               return 2;
+       case PR_WRITE_EXCLUSIVE_REG_ONLY:
+               return 3;
+       case PR_EXCLUSIVE_ACCESS_REG_ONLY:
+               return 4;
+       case PR_WRITE_EXCLUSIVE_ALL_REGS:
+               return 5;
+       case PR_EXCLUSIVE_ACCESS_ALL_REGS:
+               return 6;
+       default:
+               return 0;
+       }
+}
+
+static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
+                               u64 key, u64 sa_key, u8 op)
+{
+       struct nvme_ns *ns = bdev->bd_disk->private_data;
+       struct nvme_command c;
+       u8 data[16] = { 0, };
+
+       put_unaligned_le64(key, &data[0]);
+       put_unaligned_le64(sa_key, &data[8]);
+
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = op;
+       c.common.nsid = cpu_to_le32(ns->ns_id);
+       c.common.cdw10[0] = cpu_to_le32(cdw10);
+
+       return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+}
+
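+/*
+ * cdw10 layout for the reservation commands below (per NVMe 1.2): bits 2:0
+ * select the action, bit 3 is IEKEY (ignore existing key), bits 15:8 carry
+ * the reservation type for acquire/release, and bits 31:30 the persist
+ * through power loss handling for register.
+ */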
+static int nvme_pr_register(struct block_device *bdev, u64 old,
+               u64 new, unsigned flags)
+{
+       u32 cdw10;
+
+       if (flags & ~PR_FL_IGNORE_KEY)
+               return -EOPNOTSUPP;
+
+       cdw10 = old ? 2 : 0;
+       cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
+       cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
+       return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_reserve(struct block_device *bdev, u64 key,
+               enum pr_type type, unsigned flags)
+{
+       u32 cdw10;
+
+       if (flags & ~PR_FL_IGNORE_KEY)
+               return -EOPNOTSUPP;
+
+       cdw10 = nvme_pr_type(type) << 8;
+       cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
+       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
+               enum pr_type type, bool abort)
+{
+       u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
+       return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_clear(struct block_device *bdev, u64 key)
+{
+       u32 cdw10 = 1 | (key ? 1 << 3 : 0);
+       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+       u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
+       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+}
+
+static const struct pr_ops nvme_pr_ops = {
+       .pr_register    = nvme_pr_register,
+       .pr_reserve     = nvme_pr_reserve,
+       .pr_release     = nvme_pr_release,
+       .pr_preempt     = nvme_pr_preempt,
+       .pr_clear       = nvme_pr_clear,
+};
+
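+/*
+ * Example (annotation, not part of this patch): the pr_ops above back the
+ * generic block layer persistent reservation ioctls from <linux/pr.h>.  A
+ * hedged userspace sketch that registers a key and then takes a
+ * write-exclusive reservation (the key value is arbitrary):
+ *
+ *	#include <sys/ioctl.h>
+ *	#include <linux/pr.h>
+ *
+ *	int reserve_disk(int fd)
+ *	{
+ *		struct pr_registration reg = { .old_key = 0, .new_key = 0xabc };
+ *		struct pr_reservation rsv = { .key = 0xabc,
+ *					      .type = PR_WRITE_EXCLUSIVE };
+ *
+ *		if (ioctl(fd, IOC_PR_REGISTER, &reg))
+ *			return -1;
+ *		return ioctl(fd, IOC_PR_RESERVE, &rsv);
+ *	}
+ */
+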
+static const struct block_device_operations nvme_fops = {
+       .owner          = THIS_MODULE,
+       .ioctl          = nvme_ioctl,
+       .compat_ioctl   = nvme_compat_ioctl,
+       .open           = nvme_open,
+       .release        = nvme_release,
+       .getgeo         = nvme_getgeo,
+       .revalidate_disk = nvme_revalidate_disk,
+       .pr_ops         = &nvme_pr_ops,
+};
+
+static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
+{
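+       /* CAP.TO is in 500 ms units, so allow (CAP.TO + 1) * 500 ms in total */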
+       unsigned long timeout =
+               ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+       u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
+       int ret;
+
+       while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
+               if ((csts & NVME_CSTS_RDY) == bit)
+                       break;
+
+               msleep(100);
+               if (fatal_signal_pending(current))
+                       return -EINTR;
+               if (time_after(jiffies, timeout)) {
+                       dev_err(ctrl->dev,
+                               "Device not ready; aborting %s\n", enabled ?
+                                               "initialisation" : "reset");
+                       return -ENODEV;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * If the device has been passed off to us in an enabled state, just clear
+ * the enabled bit.  The spec says we should set the 'shutdown notification
+ * bits', but doing so may cause the device to complete commands to the
+ * admin queue ... and we don't know what memory that might be pointing at!
+ */
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+{
+       int ret;
+
+       ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
+       ctrl->ctrl_config &= ~NVME_CC_ENABLE;
+
+       ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+       if (ret)
+               return ret;
+       return nvme_wait_ready(ctrl, cap, false);
+}
+
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+{
+       /*
+        * Default to a 4K page size, with the intention to update this
+        * path in the future to accommodate architectures with differing
+        * kernel and IO page sizes.
+        */
+       unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
+       int ret;
+
+       if (page_shift < dev_page_min) {
+               dev_err(ctrl->dev,
+                       "Minimum device page size %u too large for host (%u)\n",
+                       1 << dev_page_min, 1 << page_shift);
+               return -ENODEV;
+       }
+
+       ctrl->page_size = 1 << page_shift;
+
+       ctrl->ctrl_config = NVME_CC_CSS_NVM;
+       ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
+       ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+       ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+       ctrl->ctrl_config |= NVME_CC_ENABLE;
+
+       ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+       if (ret)
+               return ret;
+       return nvme_wait_ready(ctrl, cap, true);
+}
+
+int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
+{
+       unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
+       u32 csts;
+       int ret;
+
+       ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
+       ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
+
+       ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+       if (ret)
+               return ret;
+
+       while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
+               if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
+                       break;
+
+               msleep(100);
+               if (fatal_signal_pending(current))
+                       return -EINTR;
+               if (time_after(jiffies, timeout)) {
+                       dev_err(ctrl->dev,
+                               "Device shutdown incomplete; abort shutdown\n");
+                       return -ENODEV;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Initialize the cached copies of the Identify data and various controller
+ * registers in our nvme_ctrl structure.  This should be called as soon as
+ * the admin queue is fully up and running.
+ */
+int nvme_init_identify(struct nvme_ctrl *ctrl)
+{
+       struct nvme_id_ctrl *id;
+       u64 cap;
+       int ret, page_shift;
+
+       ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
+       if (ret) {
+               dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
+               return ret;
+       }
+
+       ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
+       if (ret) {
+               dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
+               return ret;
+       }
+       page_shift = NVME_CAP_MPSMIN(cap) + 12;
+
+       if (ctrl->vs >= NVME_VS(1, 1))
+               ctrl->subsystem = NVME_CAP_NSSRC(cap);
+
+       ret = nvme_identify_ctrl(ctrl, &id);
+       if (ret) {
+               dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
+               return -EIO;
+       }
+
+       ctrl->oncs = le16_to_cpup(&id->oncs);
+       atomic_set(&ctrl->abort_limit, id->acl + 1);
+       ctrl->vwc = id->vwc;
+       memcpy(ctrl->serial, id->sn, sizeof(id->sn));
+       memcpy(ctrl->model, id->mn, sizeof(id->mn));
+       memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
+       if (id->mdts)
+               ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
+       else
+               ctrl->max_hw_sectors = UINT_MAX;
+
+       if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
+               unsigned int max_hw_sectors;
+
+               ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
+               max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
+               if (ctrl->max_hw_sectors) {
+                       ctrl->max_hw_sectors = min(max_hw_sectors,
+                                                       ctrl->max_hw_sectors);
+               } else {
+                       ctrl->max_hw_sectors = max_hw_sectors;
+               }
+       }
+
+       kfree(id);
+       return 0;
+}
+
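+/*
+ * Worked example for the MDTS scaling in nvme_init_identify above (values
+ * assumed): with CAP.MPSMIN = 0 (page_shift = 12) and id->mdts = 5 the
+ * largest transfer is 2^5 minimum-sized pages, i.e.
+ *
+ *	max_hw_sectors = 1 << (5 + 12 - 9);	// 256 sectors * 512 B = 128 KiB
+ */
+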
+static int nvme_dev_open(struct inode *inode, struct file *file)
+{
+       struct nvme_ctrl *ctrl;
+       int instance = iminor(inode);
+       int ret = -ENODEV;
+
+       spin_lock(&dev_list_lock);
+       list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
+               if (ctrl->instance != instance)
+                       continue;
+
+               if (!ctrl->admin_q) {
+                       ret = -EWOULDBLOCK;
+                       break;
+               }
+               if (!kref_get_unless_zero(&ctrl->kref))
+                       break;
+               file->private_data = ctrl;
+               ret = 0;
+               break;
+       }
+       spin_unlock(&dev_list_lock);
+
+       return ret;
+}
+
+static int nvme_dev_release(struct inode *inode, struct file *file)
+{
+       nvme_put_ctrl(file->private_data);
+       return 0;
+}
+
+static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
+{
+       struct nvme_ns *ns;
+       int ret;
+
+       mutex_lock(&ctrl->namespaces_mutex);
+       if (list_empty(&ctrl->namespaces)) {
+               ret = -ENOTTY;
+               goto out_unlock;
+       }
+
+       ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
+       if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
+               dev_warn(ctrl->dev,
+                       "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
+       dev_warn(ctrl->dev,
+               "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
+       kref_get(&ns->kref);
+       mutex_unlock(&ctrl->namespaces_mutex);
+
+       ret = nvme_user_cmd(ctrl, ns, argp);
+       nvme_put_ns(ns);
+       return ret;
+
+out_unlock:
+       mutex_unlock(&ctrl->namespaces_mutex);
+       return ret;
+}
+
+static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
+               unsigned long arg)
+{
+       struct nvme_ctrl *ctrl = file->private_data;
+       void __user *argp = (void __user *)arg;
+
+       switch (cmd) {
+       case NVME_IOCTL_ADMIN_CMD:
+               return nvme_user_cmd(ctrl, NULL, argp);
+       case NVME_IOCTL_IO_CMD:
+               return nvme_dev_user_cmd(ctrl, argp);
+       case NVME_IOCTL_RESET:
+               dev_warn(ctrl->dev, "resetting controller\n");
+               return ctrl->ops->reset_ctrl(ctrl);
+       case NVME_IOCTL_SUBSYS_RESET:
+               return nvme_reset_subsystem(ctrl);
+       default:
+               return -ENOTTY;
+       }
+}
+
+static const struct file_operations nvme_dev_fops = {
+       .owner          = THIS_MODULE,
+       .open           = nvme_dev_open,
+       .release        = nvme_dev_release,
+       .unlocked_ioctl = nvme_dev_ioctl,
+       .compat_ioctl   = nvme_dev_ioctl,
+};
+
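+/*
+ * Example (annotation, not part of this patch): resetting a controller
+ * through the character device registered with nvme_dev_fops above.  The
+ * ioctl number comes from the uapi header <linux/nvme_ioctl.h>; the device
+ * path is only an illustration.
+ *
+ *	#include <fcntl.h>
+ *	#include <sys/ioctl.h>
+ *	#include <linux/nvme_ioctl.h>
+ *
+ *	int reset_nvme(void)
+ *	{
+ *		int fd = open("/dev/nvme0", O_RDWR);
+ *
+ *		return fd < 0 ? -1 : ioctl(fd, NVME_IOCTL_RESET);
+ *	}
+ */
+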
+static ssize_t nvme_sysfs_reset(struct device *dev,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
+{
+       struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+       int ret;
+
+       ret = ctrl->ops->reset_ctrl(ctrl);
+       if (ret < 0)
+               return ret;
+       return count;
+}
+static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
+
+static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
+                                                               char *buf)
+{
+       struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+       return sprintf(buf, "%pU\n", ns->uuid);
+}
+static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
+
+static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
+                                                               char *buf)
+{
+       struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+       return sprintf(buf, "%8phd\n", ns->eui);
+}
+static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
+
+static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
+                                                               char *buf)
+{
+       struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+       return sprintf(buf, "%d\n", ns->ns_id);
+}
+static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
+
+static struct attribute *nvme_ns_attrs[] = {
+       &dev_attr_uuid.attr,
+       &dev_attr_eui.attr,
+       &dev_attr_nsid.attr,
+       NULL,
+};
+
+static umode_t nvme_attrs_are_visible(struct kobject *kobj,
+               struct attribute *a, int n)
+{
+       struct device *dev = container_of(kobj, struct device, kobj);
+       struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+
+       if (a == &dev_attr_uuid.attr) {
+               if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
+                       return 0;
+       }
+       if (a == &dev_attr_eui.attr) {
+               if (!memchr_inv(ns->eui, 0, sizeof(ns->eui)))
+                       return 0;
+       }
+       return a->mode;
+}
+
+static const struct attribute_group nvme_ns_attr_group = {
+       .attrs          = nvme_ns_attrs,
+       .is_visible     = nvme_attrs_are_visible,
+};
+
+#define nvme_show_function(field)                                              \
+static ssize_t  field##_show(struct device *dev,                               \
+                           struct device_attribute *attr, char *buf)           \
+{                                                                              \
+        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                         \
+        return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field);  \
+}                                                                              \
+static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
+
+nvme_show_function(model);
+nvme_show_function(serial);
+nvme_show_function(firmware_rev);
+
+static struct attribute *nvme_dev_attrs[] = {
+       &dev_attr_reset_controller.attr,
+       &dev_attr_model.attr,
+       &dev_attr_serial.attr,
+       &dev_attr_firmware_rev.attr,
+       NULL
+};
+
+static struct attribute_group nvme_dev_attrs_group = {
+       .attrs = nvme_dev_attrs,
+};
+
+static const struct attribute_group *nvme_dev_attr_groups[] = {
+       &nvme_dev_attrs_group,
+       NULL,
+};
+
+static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+       struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
+       struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
+
+       return nsa->ns_id - nsb->ns_id;
+}
+
+static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+       struct nvme_ns *ns;
+
+       lockdep_assert_held(&ctrl->namespaces_mutex);
+
+       list_for_each_entry(ns, &ctrl->namespaces, list) {
+               if (ns->ns_id == nsid)
+                       return ns;
+               if (ns->ns_id > nsid)
+                       break;
+       }
+       return NULL;
+}
+
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+       struct nvme_ns *ns;
+       struct gendisk *disk;
+       int node = dev_to_node(ctrl->dev);
+
+       lockdep_assert_held(&ctrl->namespaces_mutex);
+
+       ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
+       if (!ns)
+               return;
+
+       ns->queue = blk_mq_init_queue(ctrl->tagset);
+       if (IS_ERR(ns->queue))
+               goto out_free_ns;
+       queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+       ns->queue->queuedata = ns;
+       ns->ctrl = ctrl;
+
+       disk = alloc_disk_node(0, node);
+       if (!disk)
+               goto out_free_queue;
+
+       kref_init(&ns->kref);
+       ns->ns_id = nsid;
+       ns->disk = disk;
+       ns->lba_shift = 9; /* default to 512 byte sectors until the disk is validated */
+
+       blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+       if (ctrl->max_hw_sectors) {
+               blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
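+               /* one segment per device page, plus one for an unaligned start */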
+               blk_queue_max_segments(ns->queue,
+                       (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
+       }
+       if (ctrl->stripe_size)
+               blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
+       if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+               blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
+       blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
+
+       disk->major = nvme_major;
+       disk->first_minor = 0;
+       disk->fops = &nvme_fops;
+       disk->private_data = ns;
+       disk->queue = ns->queue;
+       disk->driverfs_dev = ctrl->device;
+       disk->flags = GENHD_FL_EXT_DEVT;
+       sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+
+       if (nvme_revalidate_disk(ns->disk))
+               goto out_free_disk;
+
+       list_add_tail(&ns->list, &ctrl->namespaces);
+       kref_get(&ctrl->kref);
+       if (ns->type == NVME_NS_LIGHTNVM)
+               return;
+
+       add_disk(ns->disk);
+       if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
+                                       &nvme_ns_attr_group))
+               pr_warn("%s: failed to create sysfs group for identification\n",
+                       ns->disk->disk_name);
+       return;
+ out_free_disk:
+       kfree(disk);
+ out_free_queue:
+       blk_cleanup_queue(ns->queue);
+ out_free_ns:
+       kfree(ns);
+}
+
+static void nvme_ns_remove(struct nvme_ns *ns)
+{
+       bool kill = nvme_io_incapable(ns->ctrl) &&
+                       !blk_queue_dying(ns->queue);
+
+       lockdep_assert_held(&ns->ctrl->namespaces_mutex);
+
+       if (kill) {
+               blk_set_queue_dying(ns->queue);
+
+               /*
+                * The controller was shut down first if we got here through
+                * device removal. The shutdown may requeue outstanding
+                * requests. These need to be aborted immediately so
+                * del_gendisk doesn't block indefinitely for their completion.
+                */
+               blk_mq_abort_requeue_list(ns->queue);
+       }
+       if (ns->disk->flags & GENHD_FL_UP) {
+               if (blk_get_integrity(ns->disk))
+                       blk_integrity_unregister(ns->disk);
+               sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
+                                       &nvme_ns_attr_group);
+               del_gendisk(ns->disk);
+       }
+       if (kill || !blk_queue_dying(ns->queue)) {
+               blk_mq_abort_requeue_list(ns->queue);
+               blk_cleanup_queue(ns->queue);
+       }
+       list_del_init(&ns->list);
+       nvme_put_ns(ns);
+}
+
+static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+       struct nvme_ns *ns;
+
+       ns = nvme_find_ns(ctrl, nsid);
+       if (ns) {
+               if (revalidate_disk(ns->disk))
+                       nvme_ns_remove(ns);
+       } else
+               nvme_alloc_ns(ctrl, nsid);
+}
+
+static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
+{
+       struct nvme_ns *ns;
+       __le32 *ns_list;
+       unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
+       int ret = 0;
+
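+       /* a 4 KiB Identify page returns up to 1024 32-bit namespace IDs */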
+       ns_list = kzalloc(0x1000, GFP_KERNEL);
+       if (!ns_list)
+               return -ENOMEM;
+
+       for (i = 0; i < num_lists; i++) {
+               ret = nvme_identify_ns_list(ctrl, prev, ns_list);
+               if (ret)
+                       goto out;
+
+               for (j = 0; j < min(nn, 1024U); j++) {
+                       nsid = le32_to_cpu(ns_list[j]);
+                       if (!nsid)
+                               goto out;
+
+                       nvme_validate_ns(ctrl, nsid);
+
+                       while (++prev < nsid) {
+                               ns = nvme_find_ns(ctrl, prev);
+                               if (ns)
+                                       nvme_ns_remove(ns);
+                       }
+               }
+               nn -= j;
+       }
+ out:
+       kfree(ns_list);
+       return ret;
+}
+
+static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
+{
+       struct nvme_ns *ns, *next;
+       unsigned i;
+
+       lockdep_assert_held(&ctrl->namespaces_mutex);
+
+       for (i = 1; i <= nn; i++)
+               nvme_validate_ns(ctrl, i);
+
+       list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
+               if (ns->ns_id > nn)
+                       nvme_ns_remove(ns);
+       }
+}
+
+void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
+{
+       struct nvme_id_ctrl *id;
+       unsigned nn;
+
+       if (nvme_identify_ctrl(ctrl, &id))
+               return;
+
+       mutex_lock(&ctrl->namespaces_mutex);
+       nn = le32_to_cpu(id->nn);
+       if (ctrl->vs >= NVME_VS(1, 1) &&
+           !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+               if (!nvme_scan_ns_list(ctrl, nn))
+                       goto done;
+       }
+       __nvme_scan_namespaces(ctrl, nn);
+ done:
+       list_sort(NULL, &ctrl->namespaces, ns_cmp);
+       mutex_unlock(&ctrl->namespaces_mutex);
+       kfree(id);
+}
+
+void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns, *next;
+
+       mutex_lock(&ctrl->namespaces_mutex);
+       list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
+               nvme_ns_remove(ns);
+       mutex_unlock(&ctrl->namespaces_mutex);
+}
+
+static DEFINE_IDA(nvme_instance_ida);
+
+static int nvme_set_instance(struct nvme_ctrl *ctrl)
+{
+       int instance, error;
+
+       do {
+               if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+                       return -ENODEV;
+
+               spin_lock(&dev_list_lock);
+               error = ida_get_new(&nvme_instance_ida, &instance);
+               spin_unlock(&dev_list_lock);
+       } while (error == -EAGAIN);
+
+       if (error)
+               return -ENODEV;
+
+       ctrl->instance = instance;
+       return 0;
+}
+
+static void nvme_release_instance(struct nvme_ctrl *ctrl)
+{
+       spin_lock(&dev_list_lock);
+       ida_remove(&nvme_instance_ida, ctrl->instance);
+       spin_unlock(&dev_list_lock);
+}
+
+void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
+{
+       device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
+
+       spin_lock(&dev_list_lock);
+       list_del(&ctrl->node);
+       spin_unlock(&dev_list_lock);
+}
+
+static void nvme_free_ctrl(struct kref *kref)
+{
+       struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
+
+       put_device(ctrl->device);
+       nvme_release_instance(ctrl);
+
+       ctrl->ops->free_ctrl(ctrl);
+}
+
+void nvme_put_ctrl(struct nvme_ctrl *ctrl)
+{
+       kref_put(&ctrl->kref, nvme_free_ctrl);
+}
+
+/*
+ * Initialize an NVMe controller structure.  This needs to be called as
+ * early as possible during initialization so that we have the initialized
+ * structure around during probing.
+ */
+int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
+               const struct nvme_ctrl_ops *ops, unsigned long quirks)
+{
+       int ret;
+
+       INIT_LIST_HEAD(&ctrl->namespaces);
+       mutex_init(&ctrl->namespaces_mutex);
+       kref_init(&ctrl->kref);
+       ctrl->dev = dev;
+       ctrl->ops = ops;
+       ctrl->quirks = quirks;
+
+       ret = nvme_set_instance(ctrl);
+       if (ret)
+               goto out;
+
+       ctrl->device = device_create_with_groups(nvme_class, ctrl->dev,
+                               MKDEV(nvme_char_major, ctrl->instance),
+                               dev, nvme_dev_attr_groups,
+                               "nvme%d", ctrl->instance);
+       if (IS_ERR(ctrl->device)) {
+               ret = PTR_ERR(ctrl->device);
+               goto out_release_instance;
+       }
+       get_device(ctrl->device);
+       dev_set_drvdata(ctrl->device, ctrl);
+
+       spin_lock(&dev_list_lock);
+       list_add_tail(&ctrl->node, &nvme_ctrl_list);
+       spin_unlock(&dev_list_lock);
+
+       return 0;
+out_release_instance:
+       nvme_release_instance(ctrl);
+out:
+       return ret;
+}
+
+void nvme_stop_queues(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       mutex_lock(&ctrl->namespaces_mutex);
+       list_for_each_entry(ns, &ctrl->namespaces, list) {
+               spin_lock_irq(ns->queue->queue_lock);
+               queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
+               spin_unlock_irq(ns->queue->queue_lock);
+
+               blk_mq_cancel_requeue_work(ns->queue);
+               blk_mq_stop_hw_queues(ns->queue);
+       }
+       mutex_unlock(&ctrl->namespaces_mutex);
+}
+
+void nvme_start_queues(struct nvme_ctrl *ctrl)
+{
+       struct nvme_ns *ns;
+
+       mutex_lock(&ctrl->namespaces_mutex);
+       list_for_each_entry(ns, &ctrl->namespaces, list) {
+               queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
+               blk_mq_start_stopped_hw_queues(ns->queue, true);
+               blk_mq_kick_requeue_list(ns->queue);
+       }
+       mutex_unlock(&ctrl->namespaces_mutex);
+}
+
+int __init nvme_core_init(void)
+{
+       int result;
+
+       result = register_blkdev(nvme_major, "nvme");
+       if (result < 0)
+               return result;
+       else if (result > 0)
+               nvme_major = result;
+
+       result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
+                                                       &nvme_dev_fops);
+       if (result < 0)
+               goto unregister_blkdev;
+       else if (result > 0)
+               nvme_char_major = result;
+
+       nvme_class = class_create(THIS_MODULE, "nvme");
+       if (IS_ERR(nvme_class)) {
+               result = PTR_ERR(nvme_class);
+               goto unregister_chrdev;
+       }
+
+       return 0;
+
+ unregister_chrdev:
+       __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+ unregister_blkdev:
+       unregister_blkdev(nvme_major, "nvme");
+       return result;
+}
+
+void nvme_core_exit(void)
+{
+       unregister_blkdev(nvme_major, "nvme");
+       class_destroy(nvme_class);
+       __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+}
index 1af54ea20e7bd55ff243598acfde869a106030d2..5cd3725e2fa44ae7379a81a6f1bf03bf0d45e895 100644 (file)
@@ -146,6 +146,16 @@ struct nvme_nvm_command {
        };
 };
 
+struct nvme_nvm_lp_mlc {
+       __u16                   num_pairs;
+       __u8                    pairs[886];
+};
+
+struct nvme_nvm_lp_tbl {
+       __u8                    id[8];
+       struct nvme_nvm_lp_mlc  mlc;
+};
+
 struct nvme_nvm_id_group {
        __u8                    mtype;
        __u8                    fmtype;
@@ -169,7 +179,8 @@ struct nvme_nvm_id_group {
        __le32                  mpos;
        __le32                  mccap;
        __le16                  cpar;
-       __u8                    reserved[906];
+       __u8                    reserved[10];
+       struct nvme_nvm_lp_tbl lptbl;
 } __packed;
 
 struct nvme_nvm_addr_format {
@@ -266,6 +277,15 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
                dst->mccap = le32_to_cpu(src->mccap);
 
                dst->cpar = le16_to_cpu(src->cpar);
+
+               if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
+                       memcpy(dst->lptbl.id, src->lptbl.id, 8);
+                       dst->lptbl.mlc.num_pairs =
+                                       le16_to_cpu(src->lptbl.mlc.num_pairs);
+                       /* 4 bits per pair */
+                       memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
+                                               dst->lptbl.mlc.num_pairs >> 1);
+               }
        }
 
        return 0;
@@ -274,7 +294,6 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
 static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
 {
        struct nvme_ns *ns = nvmdev->q->queuedata;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_nvm_id *nvme_nvm_id;
        struct nvme_nvm_command c = {};
        int ret;
@@ -287,7 +306,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
        if (!nvme_nvm_id)
                return -ENOMEM;
 
-       ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c,
+       ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
                                nvme_nvm_id, sizeof(struct nvme_nvm_id));
        if (ret) {
                ret = -EIO;
@@ -312,9 +331,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
                                nvm_l2p_update_fn *update_l2p, void *priv)
 {
        struct nvme_ns *ns = nvmdev->q->queuedata;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_nvm_command c = {};
-       u32 len = queue_max_hw_sectors(dev->admin_q) << 9;
+       u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9;
        u32 nlb_pr_rq = len / sizeof(u64);
        u64 cmd_slba = slba;
        void *entries;
@@ -332,10 +350,10 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
                c.l2p.slba = cpu_to_le64(cmd_slba);
                c.l2p.nlb = cpu_to_le32(cmd_nlb);
 
-               ret = nvme_submit_sync_cmd(dev->admin_q,
+               ret = nvme_submit_sync_cmd(ns->ctrl->admin_q,
                                (struct nvme_command *)&c, entries, len);
                if (ret) {
-                       dev_err(dev->dev, "L2P table transfer failed (%d)\n",
+                       dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n",
                                                                        ret);
                        ret = -EIO;
                        goto out;
@@ -361,7 +379,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 {
        struct request_queue *q = nvmdev->q;
        struct nvme_ns *ns = q->queuedata;
-       struct nvme_dev *dev = ns->dev;
+       struct nvme_ctrl *ctrl = ns->ctrl;
        struct nvme_nvm_command c = {};
        struct nvme_nvm_bb_tbl *bb_tbl;
        int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks;
@@ -375,41 +393,36 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
        if (!bb_tbl)
                return -ENOMEM;
 
-       ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c,
+       ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c,
                                                                bb_tbl, tblsz);
        if (ret) {
-               dev_err(dev->dev, "get bad block table failed (%d)\n", ret);
+               dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret);
                ret = -EIO;
                goto out;
        }
 
        if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' ||
                bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') {
-               dev_err(dev->dev, "bbt format mismatch\n");
+               dev_err(ctrl->dev, "bbt format mismatch\n");
                ret = -EINVAL;
                goto out;
        }
 
        if (le16_to_cpu(bb_tbl->verid) != 1) {
                ret = -EINVAL;
-               dev_err(dev->dev, "bbt version not supported\n");
+               dev_err(ctrl->dev, "bbt version not supported\n");
                goto out;
        }
 
        if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) {
                ret = -EINVAL;
-               dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)",
+               dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)",
                                        le32_to_cpu(bb_tbl->tblks), nr_blocks);
                goto out;
        }
 
        ppa = dev_to_generic_addr(nvmdev, ppa);
        ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv);
-       if (ret) {
-               ret = -EINTR;
-               goto out;
-       }
-
 out:
        kfree(bb_tbl);
        return ret;
@@ -419,7 +432,6 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd,
                                                                int type)
 {
        struct nvme_ns *ns = nvmdev->q->queuedata;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_nvm_command c = {};
        int ret = 0;
 
@@ -429,10 +441,10 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd,
        c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1);
        c.set_bb.value = type;
 
-       ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c,
+       ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
                                                                NULL, 0);
        if (ret)
-               dev_err(dev->dev, "set bad block table failed (%d)\n", ret);
+               dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret);
        return ret;
 }
 
@@ -453,11 +465,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 static void nvme_nvm_end_io(struct request *rq, int error)
 {
        struct nvm_rq *rqd = rq->end_io_data;
-       struct nvm_dev *dev = rqd->dev;
 
-       if (dev->mt && dev->mt->end_io(rqd, error))
-               pr_err("nvme: err status: %x result: %lx\n",
-                               rq->errors, (unsigned long)rq->special);
+       nvm_end_io(rqd, error);
 
        kfree(rq->cmd);
        blk_mq_free_request(rq);
@@ -520,9 +529,8 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
 static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
 {
        struct nvme_ns *ns = nvmdev->q->queuedata;
-       struct nvme_dev *dev = ns->dev;
 
-       return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0);
+       return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0);
 }
 
 static void nvme_nvm_destroy_dma_pool(void *pool)
@@ -580,8 +588,9 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name)
 
 int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
-       struct nvme_dev *dev = ns->dev;
-       struct pci_dev *pdev = to_pci_dev(dev->dev);
+       struct nvme_ctrl *ctrl = ns->ctrl;
+       /* XXX: this is poking into PCI structures from generic code! */
+       struct pci_dev *pdev = to_pci_dev(ctrl->dev);
 
        /* QEMU NVMe simulator - PCI ID + Vendor specific bit */
        if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
index 044253dca30a433549a37ec7e28c3a6c161d38a5..4fb5bb737868ce2db7da41700cd238c8804b086c 100644 (file)
 #include <linux/kref.h>
 #include <linux/blk-mq.h>
 
+enum {
+       /*
+        * Driver internal status code for commands that were cancelled due
+        * to timeouts or controller shutdown.  The value is negative so
+        * that it a) doesn't overlap with the unsigned hardware error codes,
+        * and b) can easily be tested for.
+        */
+       NVME_SC_CANCELLED               = -EINTR,
+};
+
 extern unsigned char nvme_io_timeout;
 #define NVME_IO_TIMEOUT        (nvme_io_timeout * HZ)
 
+extern unsigned char admin_timeout;
+#define ADMIN_TIMEOUT  (admin_timeout * HZ)
+
+extern unsigned char shutdown_timeout;
+#define SHUTDOWN_TIMEOUT       (shutdown_timeout * HZ)
+
 enum {
        NVME_NS_LBA             = 0,
        NVME_NS_LIGHTNVM        = 1,
 };
 
 /*
- * Represents an NVM Express device.  Each nvme_dev is a PCI function.
+ * List of workarounds for devices that require behavior not specified in
+ * the standard.
  */
-struct nvme_dev {
-       struct list_head node;
-       struct nvme_queue **queues;
+enum nvme_quirks {
+       /*
+        * Prefers I/O aligned to a stripe size specified in a vendor
+        * specific Identify field.
+        */
+       NVME_QUIRK_STRIPE_SIZE                  = (1 << 0),
+
+       /*
+        * The controller doesn't handle Identify values other than 0 or 1
+        * correctly.
+        */
+       NVME_QUIRK_IDENTIFY_CNS                 = (1 << 1),
+};
+
+struct nvme_ctrl {
+       const struct nvme_ctrl_ops *ops;
        struct request_queue *admin_q;
-       struct blk_mq_tag_set tagset;
-       struct blk_mq_tag_set admin_tagset;
-       u32 __iomem *dbs;
        struct device *dev;
-       struct dma_pool *prp_page_pool;
-       struct dma_pool *prp_small_pool;
+       struct kref kref;
        int instance;
-       unsigned queue_count;
-       unsigned online_queues;
-       unsigned max_qid;
-       int q_depth;
-       u32 db_stride;
-       u32 ctrl_config;
-       struct msix_entry *entry;
-       struct nvme_bar __iomem *bar;
+       struct blk_mq_tag_set *tagset;
        struct list_head namespaces;
-       struct kref kref;
-       struct device *device;
-       struct work_struct reset_work;
-       struct work_struct probe_work;
-       struct work_struct scan_work;
+       struct mutex namespaces_mutex;
+       struct device *device;  /* char device */
+       struct list_head node;
+
        char name[12];
        char serial[20];
        char model[40];
        char firmware_rev[8];
-       bool subsystem;
+
+       u32 ctrl_config;
+
+       u32 page_size;
        u32 max_hw_sectors;
        u32 stripe_size;
-       u32 page_size;
-       void __iomem *cmb;
-       dma_addr_t cmb_dma_addr;
-       u64 cmb_size;
-       u32 cmbsz;
        u16 oncs;
-       u16 abort_limit;
+       atomic_t abort_limit;
        u8 event_limit;
        u8 vwc;
+       u32 vs;
+       bool subsystem;
+       unsigned long quirks;
 };
 
 /*
@@ -79,11 +98,14 @@ struct nvme_dev {
 struct nvme_ns {
        struct list_head list;
 
-       struct nvme_dev *dev;
+       struct nvme_ctrl *ctrl;
        struct request_queue *queue;
        struct gendisk *disk;
        struct kref kref;
 
+       u8 eui[8];
+       u8 uuid[16];
+
        unsigned ns_id;
        int lba_shift;
        u16 ms;
@@ -94,41 +116,156 @@ struct nvme_ns {
        u32 mode_select_block_len;
 };
 
-/*
- * The nvme_iod describes the data in an I/O, including the list of PRP
- * entries.  You can't see it in this data structure because C doesn't let
- * me express that.  Use nvme_alloc_iod to ensure there's enough space
- * allocated to store the PRP list.
- */
-struct nvme_iod {
-       unsigned long private;  /* For the use of the submitter of the I/O */
-       int npages;             /* In the PRP list. 0 means small pool in use */
-       int offset;             /* Of PRP list */
-       int nents;              /* Used in scatterlist */
-       int length;             /* Of data, in bytes */
-       dma_addr_t first_dma;
-       struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
-       struct scatterlist sg[0];
+struct nvme_ctrl_ops {
+       int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
+       int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
+       int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
+       bool (*io_incapable)(struct nvme_ctrl *ctrl);
+       int (*reset_ctrl)(struct nvme_ctrl *ctrl);
+       void (*free_ctrl)(struct nvme_ctrl *ctrl);
 };
 
+static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
+{
+       u32 val = 0;
+
+       if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
+               return false;
+       return val & NVME_CSTS_RDY;
+}
+
+static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl)
+{
+       u32 val = 0;
+
+       if (ctrl->ops->io_incapable(ctrl))
+               return true;
+       if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
+               return false;
+       return val & NVME_CSTS_CFS;
+}
+
+static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
+{
+       if (!ctrl->subsystem)
+               return -ENOTTY;
+       return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); /* "NVMe" */
+}
+
 static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
 {
        return (sector >> (ns->lba_shift - 9));
 }
 
+static inline void nvme_setup_flush(struct nvme_ns *ns,
+               struct nvme_command *cmnd)
+{
+       memset(cmnd, 0, sizeof(*cmnd));
+       cmnd->common.opcode = nvme_cmd_flush;
+       cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+}
+
+static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
+               struct nvme_command *cmnd)
+{
+       u16 control = 0;
+       u32 dsmgmt = 0;
+
+       if (req->cmd_flags & REQ_FUA)
+               control |= NVME_RW_FUA;
+       if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+               control |= NVME_RW_LR;
+
+       if (req->cmd_flags & REQ_RAHEAD)
+               dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+
+       memset(cmnd, 0, sizeof(*cmnd));
+       cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+       cmnd->rw.command_id = req->tag;
+       cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+       cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+       cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+
+       if (ns->ms) {
+               switch (ns->pi_type) {
+               case NVME_NS_DPS_PI_TYPE3:
+                       control |= NVME_RW_PRINFO_PRCHK_GUARD;
+                       break;
+               case NVME_NS_DPS_PI_TYPE1:
+               case NVME_NS_DPS_PI_TYPE2:
+                       control |= NVME_RW_PRINFO_PRCHK_GUARD |
+                                       NVME_RW_PRINFO_PRCHK_REF;
+                       cmnd->rw.reftag = cpu_to_le32(
+                                       nvme_block_nr(ns, blk_rq_pos(req)));
+                       break;
+               }
+               if (!blk_integrity_rq(req))
+                       control |= NVME_RW_PRINFO_PRACT;
+       }
+
+       cmnd->rw.control = cpu_to_le16(control);
+       cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+}
+
+static inline int nvme_error_status(u16 status)
+{
+       switch (status & 0x7ff) {
+       case NVME_SC_SUCCESS:
+               return 0;
+       case NVME_SC_CAP_EXCEEDED:
+               return -ENOSPC;
+       default:
+               return -EIO;
+       }
+}
+
+static inline bool nvme_req_needs_retry(struct request *req, u16 status)
+{
+       return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
+               (jiffies - req->start_time) < req->timeout;
+}
+
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
+int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
+int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
+               const struct nvme_ctrl_ops *ops, unsigned long quirks);
+void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
+void nvme_put_ctrl(struct nvme_ctrl *ctrl);
+int nvme_init_identify(struct nvme_ctrl *ctrl);
+
+void nvme_scan_namespaces(struct nvme_ctrl *ctrl);
+void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
+
+void nvme_stop_queues(struct nvme_ctrl *ctrl);
+void nvme_start_queues(struct nvme_ctrl *ctrl);
+
+struct request *nvme_alloc_request(struct request_queue *q,
+               struct nvme_command *cmd, unsigned int flags);
+void nvme_requeue_req(struct request *req);
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
                void *buf, unsigned bufflen);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
-               void *buffer, void __user *ubuffer, unsigned bufflen,
+               void *buffer, unsigned bufflen,  u32 *result, unsigned timeout);
+int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+               void __user *ubuffer, unsigned bufflen, u32 *result,
+               unsigned timeout);
+int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+               void __user *ubuffer, unsigned bufflen,
+               void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
                u32 *result, unsigned timeout);
-int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id);
-int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id);
+int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
                struct nvme_id_ns **id);
-int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log);
-int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
+int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log);
+int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
                        dma_addr_t dma_addr, u32 *result);
-int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
+int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
                        dma_addr_t dma_addr, u32 *result);
+int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
+
+extern spinlock_t dev_list_lock;
 
 struct sg_io_hdr;
 
@@ -154,4 +291,7 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i
 }
 #endif /* CONFIG_NVM */
 
+int __init nvme_core_init(void);
+void nvme_core_exit(void);
+
 #endif /* _NVME_H */
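
A hedged sketch of how a transport driver might fill in the nvme_ctrl_ops
added above (all "my_" names are hypothetical; the PCIe driver below does
essentially this against its BAR mapping):

	static int my_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
	{
		*val = readl(to_my_dev(ctrl)->bar + off);
		return 0;
	}

	static int my_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
	{
		writel(val, to_my_dev(ctrl)->bar + off);
		return 0;
	}

	static const struct nvme_ctrl_ops my_ctrl_ops = {
		.reg_read32	= my_reg_read32,
		.reg_write32	= my_reg_write32,
		/* .reg_read64, .io_incapable, .reset_ctrl, .free_ctrl, ... */
	};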
index f5c0e2613c7cc87fa04ec3bc30f51b14aa1c8278..72ef8322d32ac7180912e2ae2cf38245f3713d9a 100644 (file)
@@ -12,6 +12,7 @@
  * more details.
  */
 
+#include <linux/aer.h>
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
 #include <linux/kdev_t.h>
 #include <linux/kthread.h>
 #include <linux/kernel.h>
-#include <linux/list_sort.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/poison.h>
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/t10-pi.h>
 #include <linux/types.h>
-#include <linux/pr.h>
-#include <scsi/sg.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <asm/unaligned.h>
 
-#include <uapi/linux/nvme_ioctl.h>
 #include "nvme.h"
 
-#define NVME_MINORS            (1U << MINORBITS)
 #define NVME_Q_DEPTH           1024
 #define NVME_AQ_DEPTH          256
 #define SQ_SIZE(depth)         (depth * sizeof(struct nvme_command))
 #define CQ_SIZE(depth)         (depth * sizeof(struct nvme_completion))
-#define ADMIN_TIMEOUT          (admin_timeout * HZ)
-#define SHUTDOWN_TIMEOUT       (shutdown_timeout * HZ)
+
+/*
+ * We handle AEN commands ourselves and don't even let the
+ * block layer know about them.
+ */
+#define NVME_NR_AEN_COMMANDS   1
+#define NVME_AQ_BLKMQ_DEPTH    (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
 
-static unsigned char admin_timeout = 60;
+unsigned char admin_timeout = 60;
 module_param(admin_timeout, byte, 0644);
 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
 
@@ -63,16 +65,10 @@ unsigned char nvme_io_timeout = 30;
 module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
 
-static unsigned char shutdown_timeout = 5;
+unsigned char shutdown_timeout = 5;
 module_param(shutdown_timeout, byte, 0644);
 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
 
-static int nvme_major;
-module_param(nvme_major, int, 0);
-
-static int nvme_char_major;
-module_param(nvme_char_major, int, 0);
-
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -80,28 +76,60 @@ static bool use_cmb_sqes = true;
 module_param(use_cmb_sqes, bool, 0644);
 MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
 
-static DEFINE_SPINLOCK(dev_list_lock);
 static LIST_HEAD(dev_list);
 static struct task_struct *nvme_thread;
 static struct workqueue_struct *nvme_workq;
 static wait_queue_head_t nvme_kthread_wait;
 
-static struct class *nvme_class;
+struct nvme_dev;
+struct nvme_queue;
 
-static int __nvme_reset(struct nvme_dev *dev);
 static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_dead_ctrl(struct nvme_dev *dev);
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
+static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
-struct async_cmd_info {
-       struct kthread_work work;
-       struct kthread_worker *worker;
-       struct request *req;
-       u32 result;
-       int status;
-       void *ctx;
+/*
+ * Represents an NVM Express device.  Each nvme_dev is a PCI function.
+ */
+struct nvme_dev {
+       struct list_head node;
+       struct nvme_queue **queues;
+       struct blk_mq_tag_set tagset;
+       struct blk_mq_tag_set admin_tagset;
+       u32 __iomem *dbs;
+       struct device *dev;
+       struct dma_pool *prp_page_pool;
+       struct dma_pool *prp_small_pool;
+       unsigned queue_count;
+       unsigned online_queues;
+       unsigned max_qid;
+       int q_depth;
+       u32 db_stride;
+       struct msix_entry *entry;
+       void __iomem *bar;
+       struct work_struct reset_work;
+       struct work_struct scan_work;
+       struct work_struct remove_work;
+       struct mutex shutdown_lock;
+       bool subsystem;
+       void __iomem *cmb;
+       dma_addr_t cmb_dma_addr;
+       u64 cmb_size;
+       u32 cmbsz;
+       unsigned long flags;
+
+#define NVME_CTRL_RESETTING    0
+
+       struct nvme_ctrl ctrl;
+       struct completion ioq_wait;
 };
 
+static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
+{
+       return container_of(ctrl, struct nvme_dev, ctrl);
+}
+
 /*
  * An NVM Express queue.  Each device has at least two (one for admin
  * commands and one for I/O commands).
@@ -126,7 +154,24 @@ struct nvme_queue {
        u16 qid;
        u8 cq_phase;
        u8 cqe_seen;
-       struct async_cmd_info cmdinfo;
+};
+
+/*
+ * The nvme_iod describes the data in an I/O, including the list of PRP
+ * entries.  You can't see it in this data structure because C doesn't let
+ * me express that.  Use nvme_init_iod to ensure there's enough space
+ * allocated to store the PRP list.
+ */
+struct nvme_iod {
+       struct nvme_queue *nvmeq;
+       int aborted;
+       int npages;             /* In the PRP list. 0 means small pool in use */
+       int nents;              /* Used in scatterlist */
+       int length;             /* Of data, in bytes */
+       dma_addr_t first_dma;
+       struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
+       struct scatterlist *sg;
+       struct scatterlist inline_sg[0];
 };
 
 /*
@@ -148,23 +193,11 @@ static inline void _nvme_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 }
 
-typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
-                                               struct nvme_completion *);
-
-struct nvme_cmd_info {
-       nvme_completion_fn fn;
-       void *ctx;
-       int aborted;
-       struct nvme_queue *nvmeq;
-       struct nvme_iod iod[0];
-};
-
 /*
  * Max size of iod being embedded in the request payload
  */
 #define NVME_INT_PAGES         2
-#define NVME_INT_BYTES(dev)    (NVME_INT_PAGES * (dev)->page_size)
-#define NVME_INT_MASK          0x01
+#define NVME_INT_BYTES(dev)    (NVME_INT_PAGES * (dev)->ctrl.page_size)
 
 /*
  * Will slightly overestimate the number of pages needed.  This is OK
@@ -173,19 +206,22 @@ struct nvme_cmd_info {
  */
 static int nvme_npages(unsigned size, struct nvme_dev *dev)
 {
-       unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+       unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
+                                     dev->ctrl.page_size);
        return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
 }
 
-static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev,
+               unsigned int size, unsigned int nseg)
 {
-       unsigned int ret = sizeof(struct nvme_cmd_info);
-
-       ret += sizeof(struct nvme_iod);
-       ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
-       ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+       return sizeof(__le64 *) * nvme_npages(size, dev) +
+                       sizeof(struct scatterlist) * nseg;
+}
 
-       return ret;
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+       return sizeof(struct nvme_iod) +
+               nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
 }
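This value feeds the tag set's cmd_size (nvme_alloc_admin_tags() below does exactly that), so the iod plus its inline scatterlist and PRP pointers come along with every struct request; blk_mq_rq_to_pdu() is then plain pointer arithmetic. A simplified sketch of the relationship, a fragment rather than the verbatim blk-mq source:

	/* blk-mq side: the PDU sits directly behind the request */
	static inline void *blk_mq_rq_to_pdu(struct request *rq)
	{
		return rq + 1;
	}

	/* driver side, at tag-set setup ("set" is the blk_mq_tag_set): */
	set->cmd_size = nvme_cmd_size(dev);	/* iod + inline SG/PRP storage */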
 
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -215,11 +251,11 @@ static int nvme_admin_init_request(void *data, struct request *req,
                                unsigned int numa_node)
 {
        struct nvme_dev *dev = data;
-       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_queue *nvmeq = dev->queues[0];
 
        BUG_ON(!nvmeq);
-       cmd->nvmeq = nvmeq;
+       iod->nvmeq = nvmeq;
        return 0;
 }
 
@@ -242,148 +278,36 @@ static int nvme_init_request(void *data, struct request *req,
                                unsigned int numa_node)
 {
        struct nvme_dev *dev = data;
-       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
 
        BUG_ON(!nvmeq);
-       cmd->nvmeq = nvmeq;
+       iod->nvmeq = nvmeq;
        return 0;
 }
 
-static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
-                               nvme_completion_fn handler)
+static void nvme_complete_async_event(struct nvme_dev *dev,
+               struct nvme_completion *cqe)
 {
-       cmd->fn = handler;
-       cmd->ctx = ctx;
-       cmd->aborted = 0;
-       blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
-}
-
-static void *iod_get_private(struct nvme_iod *iod)
-{
-       return (void *) (iod->private & ~0x1UL);
-}
-
-/*
- * If bit 0 is set, the iod is embedded in the request payload.
- */
-static bool iod_should_kfree(struct nvme_iod *iod)
-{
-       return (iod->private & NVME_INT_MASK) == 0;
-}
-
-/* Special values must be less than 0x1000 */
-#define CMD_CTX_BASE           ((void *)POISON_POINTER_DELTA)
-#define CMD_CTX_CANCELLED      (0x30C + CMD_CTX_BASE)
-#define CMD_CTX_COMPLETED      (0x310 + CMD_CTX_BASE)
-#define CMD_CTX_INVALID                (0x314 + CMD_CTX_BASE)
-
-static void special_completion(struct nvme_queue *nvmeq, void *ctx,
-                                               struct nvme_completion *cqe)
-{
-       if (ctx == CMD_CTX_CANCELLED)
-               return;
-       if (ctx == CMD_CTX_COMPLETED) {
-               dev_warn(nvmeq->q_dmadev,
-                               "completed id %d twice on queue %d\n",
-                               cqe->command_id, le16_to_cpup(&cqe->sq_id));
-               return;
-       }
-       if (ctx == CMD_CTX_INVALID) {
-               dev_warn(nvmeq->q_dmadev,
-                               "invalid id %d completed on queue %d\n",
-                               cqe->command_id, le16_to_cpup(&cqe->sq_id));
-               return;
-       }
-       dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
-}
-
-static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
-{
-       void *ctx;
-
-       if (fn)
-               *fn = cmd->fn;
-       ctx = cmd->ctx;
-       cmd->fn = special_completion;
-       cmd->ctx = CMD_CTX_CANCELLED;
-       return ctx;
-}
-
-static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
-                                               struct nvme_completion *cqe)
-{
-       u32 result = le32_to_cpup(&cqe->result);
-       u16 status = le16_to_cpup(&cqe->status) >> 1;
+       u16 status = le16_to_cpu(cqe->status) >> 1;
+       u32 result = le32_to_cpu(cqe->result);
 
        if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
-               ++nvmeq->dev->event_limit;
+               ++dev->ctrl.event_limit;
        if (status != NVME_SC_SUCCESS)
                return;
 
        switch (result & 0xff07) {
        case NVME_AER_NOTICE_NS_CHANGED:
-               dev_info(nvmeq->q_dmadev, "rescanning\n");
-               schedule_work(&nvmeq->dev->scan_work);
+               dev_info(dev->dev, "rescanning\n");
+               queue_work(nvme_workq, &dev->scan_work);
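+               /* no break: the notice also falls through to the default warning */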
        default:
-               dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
-       }
-}
-
-static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
-                                               struct nvme_completion *cqe)
-{
-       struct request *req = ctx;
-
-       u16 status = le16_to_cpup(&cqe->status) >> 1;
-       u32 result = le32_to_cpup(&cqe->result);
-
-       blk_mq_free_request(req);
-
-       dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
-       ++nvmeq->dev->abort_limit;
-}
-
-static void async_completion(struct nvme_queue *nvmeq, void *ctx,
-                                               struct nvme_completion *cqe)
-{
-       struct async_cmd_info *cmdinfo = ctx;
-       cmdinfo->result = le32_to_cpup(&cqe->result);
-       cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
-       queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
-       blk_mq_free_request(cmdinfo->req);
-}
-
-static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
-                                 unsigned int tag)
-{
-       struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
-
-       return blk_mq_rq_to_pdu(req);
-}
-
-/*
- * Called with local interrupts disabled and the q_lock held.  May not sleep.
- */
-static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
-                                               nvme_completion_fn *fn)
-{
-       struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
-       void *ctx;
-       if (tag >= nvmeq->q_depth) {
-               *fn = special_completion;
-               return CMD_CTX_INVALID;
+               dev_warn(dev->dev, "async event result %08x\n", result);
        }
-       if (fn)
-               *fn = cmd->fn;
-       ctx = cmd->ctx;
-       cmd->fn = special_completion;
-       cmd->ctx = CMD_CTX_COMPLETED;
-       return ctx;
 }
 
 /**
- * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
  * @cmd: The command to send
  *
@@ -405,69 +329,44 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
        nvmeq->sq_tail = tail;
 }
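Only the tail of __nvme_submit_cmd() is visible above the hunk boundary; the elided part copies the 64-byte command into the slot at sq_tail and rings the doorbell. A hedged sketch of the ring arithmetic (the real function also has a memcpy_toio() path for CMB-resident queues):

	u16 tail = nvmeq->sq_tail;

	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;			/* wrap the ring */
	writel(tail, nvmeq->q_db);		/* SQ tail doorbell */
	nvmeq->sq_tail = tail;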
 
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
-{
-       unsigned long flags;
-       spin_lock_irqsave(&nvmeq->q_lock, flags);
-       __nvme_submit_cmd(nvmeq, cmd);
-       spin_unlock_irqrestore(&nvmeq->q_lock, flags);
-}
-
-static __le64 **iod_list(struct nvme_iod *iod)
-{
-       return ((void *)iod) + iod->offset;
-}
-
-static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
-                           unsigned nseg, unsigned long private)
-{
-       iod->private = private;
-       iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-       iod->npages = -1;
-       iod->length = nbytes;
-       iod->nents = 0;
-}
-
-static struct nvme_iod *
-__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
-                unsigned long priv, gfp_t gfp)
+static __le64 **iod_list(struct request *req)
 {
-       struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-                               sizeof(__le64 *) * nvme_npages(bytes, dev) +
-                               sizeof(struct scatterlist) * nseg, gfp);
-
-       if (iod)
-               iod_init(iod, bytes, nseg, priv);
-
-       return iod;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       return (__le64 **)(iod->sg + req->nr_phys_segments);
 }
 
-static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
-                                      gfp_t gfp)
+static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 {
-       unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
-                                                sizeof(struct nvme_dsm_range);
-       struct nvme_iod *iod;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
+       int nseg = rq->nr_phys_segments;
+       unsigned size;
 
-       if (rq->nr_phys_segments <= NVME_INT_PAGES &&
-           size <= NVME_INT_BYTES(dev)) {
-               struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+       if (rq->cmd_flags & REQ_DISCARD)
+               size = sizeof(struct nvme_dsm_range);
+       else
+               size = blk_rq_bytes(rq);
 
-               iod = cmd->iod;
-               iod_init(iod, size, rq->nr_phys_segments,
-                               (unsigned long) rq | NVME_INT_MASK);
-               return iod;
+       if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
+               iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
+               if (!iod->sg)
+                       return BLK_MQ_RQ_QUEUE_BUSY;
+       } else {
+               iod->sg = iod->inline_sg;
        }
 
-       return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
-                               (unsigned long) rq, gfp);
+       iod->aborted = 0;
+       iod->npages = -1;
+       iod->nents = 0;
+       iod->length = size;
+       return 0;
 }
 
-static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
 {
-       const int last_prp = dev->page_size / 8 - 1;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       const int last_prp = dev->ctrl.page_size / 8 - 1;
        int i;
-       __le64 **list = iod_list(iod);
+       __le64 **list = iod_list(req);
        dma_addr_t prp_dma = iod->first_dma;
 
        if (iod->npages == 0)
@@ -479,20 +378,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
                prp_dma = next_prp_dma;
        }
 
-       if (iod_should_kfree(iod))
-               kfree(iod);
-}
-
-static int nvme_error_status(u16 status)
-{
-       switch (status & 0x7ff) {
-       case NVME_SC_SUCCESS:
-               return 0;
-       case NVME_SC_CAP_EXCEEDED:
-               return -ENOSPC;
-       default:
-               return -EIO;
-       }
+       if (iod->sg != iod->inline_sg)
+               kfree(iod->sg);
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -549,27 +436,6 @@ static void nvme_dif_remap(struct request *req,
        }
        kunmap_atomic(pmap);
 }
-
-static void nvme_init_integrity(struct nvme_ns *ns)
-{
-       struct blk_integrity integrity;
-
-       switch (ns->pi_type) {
-       case NVME_NS_DPS_PI_TYPE3:
-               integrity.profile = &t10_pi_type3_crc;
-               break;
-       case NVME_NS_DPS_PI_TYPE1:
-       case NVME_NS_DPS_PI_TYPE2:
-               integrity.profile = &t10_pi_type1_crc;
-               break;
-       default:
-               integrity.profile = NULL;
-               break;
-       }
-       integrity.tuple_size = ns->ms;
-       blk_integrity_register(ns->disk, &integrity);
-       blk_queue_max_integrity_segments(ns->queue, 1);
-}
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 static void nvme_dif_remap(struct request *req,
                        void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
@@ -581,91 +447,27 @@ static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
 static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
 {
 }
-static void nvme_init_integrity(struct nvme_ns *ns)
-{
-}
 #endif
 
-static void req_completion(struct nvme_queue *nvmeq, void *ctx,
-                                               struct nvme_completion *cqe)
-{
-       struct nvme_iod *iod = ctx;
-       struct request *req = iod_get_private(iod);
-       struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-       u16 status = le16_to_cpup(&cqe->status) >> 1;
-       bool requeue = false;
-       int error = 0;
-
-       if (unlikely(status)) {
-               if (!(status & NVME_SC_DNR || blk_noretry_request(req))
-                   && (jiffies - req->start_time) < req->timeout) {
-                       unsigned long flags;
-
-                       requeue = true;
-                       blk_mq_requeue_request(req);
-                       spin_lock_irqsave(req->q->queue_lock, flags);
-                       if (!blk_queue_stopped(req->q))
-                               blk_mq_kick_requeue_list(req->q);
-                       spin_unlock_irqrestore(req->q->queue_lock, flags);
-                       goto release_iod;
-               }
-
-               if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
-                       if (cmd_rq->ctx == CMD_CTX_CANCELLED)
-                               error = -EINTR;
-                       else
-                               error = status;
-               } else {
-                       error = nvme_error_status(status);
-               }
-       }
-
-       if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
-               u32 result = le32_to_cpup(&cqe->result);
-               req->special = (void *)(uintptr_t)result;
-       }
-
-       if (cmd_rq->aborted)
-               dev_warn(nvmeq->dev->dev,
-                       "completing aborted command with status:%04x\n",
-                       error);
-
-release_iod:
-       if (iod->nents) {
-               dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
-                       rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               if (blk_integrity_rq(req)) {
-                       if (!rq_data_dir(req))
-                               nvme_dif_remap(req, nvme_dif_complete);
-                       dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
-                               rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               }
-       }
-       nvme_free_iod(nvmeq->dev, iod);
-
-       if (likely(!requeue))
-               blk_mq_complete_request(req, error);
-}
-
-/* length is in bytes.  gfp flags indicates whether we may sleep. */
-static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
-               int total_len, gfp_t gfp)
+static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req,
+               int total_len)
 {
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct dma_pool *pool;
        int length = total_len;
        struct scatterlist *sg = iod->sg;
        int dma_len = sg_dma_len(sg);
        u64 dma_addr = sg_dma_address(sg);
-       u32 page_size = dev->page_size;
+       u32 page_size = dev->ctrl.page_size;
        int offset = dma_addr & (page_size - 1);
        __le64 *prp_list;
-       __le64 **list = iod_list(iod);
+       __le64 **list = iod_list(req);
        dma_addr_t prp_dma;
        int nprps, i;
 
        length -= (page_size - offset);
        if (length <= 0)
-               return total_len;
+               return true;
 
        dma_len -= (page_size - offset);
        if (dma_len) {
@@ -678,7 +480,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
 
        if (length <= page_size) {
                iod->first_dma = dma_addr;
-               return total_len;
+               return true;
        }
 
        nprps = DIV_ROUND_UP(length, page_size);
@@ -690,11 +492,11 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
                iod->npages = 1;
        }
 
-       prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+       prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
        if (!prp_list) {
                iod->first_dma = dma_addr;
                iod->npages = -1;
-               return (total_len - length) + page_size;
+               return false;
        }
        list[0] = prp_list;
        iod->first_dma = prp_dma;
@@ -702,9 +504,9 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
        for (;;) {
                if (i == page_size >> 3) {
                        __le64 *old_prp_list = prp_list;
-                       prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+                       prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
                        if (!prp_list)
-                               return total_len - length;
+                               return false;
                        list[iod->npages++] = prp_list;
                        prp_list[0] = old_prp_list[i - 1];
                        old_prp_list[i - 1] = cpu_to_le64(prp_dma);
@@ -724,115 +526,105 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
                dma_len = sg_dma_len(sg);
        }
 
-       return total_len;
+       return true;
 }
 
-static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
-               struct nvme_iod *iod)
+static int nvme_map_data(struct nvme_dev *dev, struct request *req,
+               struct nvme_command *cmnd)
 {
-       struct nvme_command cmnd;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct request_queue *q = req->q;
+       enum dma_data_direction dma_dir = rq_data_dir(req) ?
+                       DMA_TO_DEVICE : DMA_FROM_DEVICE;
+       int ret = BLK_MQ_RQ_QUEUE_ERROR;
 
-       memcpy(&cmnd, req->cmd, sizeof(cmnd));
-       cmnd.rw.command_id = req->tag;
-       if (req->nr_phys_segments) {
-               cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-               cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
-       }
+       sg_init_table(iod->sg, req->nr_phys_segments);
+       iod->nents = blk_rq_map_sg(q, req, iod->sg);
+       if (!iod->nents)
+               goto out;
 
-       __nvme_submit_cmd(nvmeq, &cmnd);
-}
+       ret = BLK_MQ_RQ_QUEUE_BUSY;
+       if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
+               goto out;
 
-/*
- * We reuse the small pool to allocate the 16-byte range here as it is not
- * worth having a special pool for these or additional cases to handle freeing
- * the iod.
- */
-static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-               struct request *req, struct nvme_iod *iod)
-{
-       struct nvme_dsm_range *range =
-                               (struct nvme_dsm_range *)iod_list(iod)[0];
-       struct nvme_command cmnd;
+       if (!nvme_setup_prps(dev, req, blk_rq_bytes(req)))
+               goto out_unmap;
 
-       range->cattr = cpu_to_le32(0);
-       range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
-       range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+       ret = BLK_MQ_RQ_QUEUE_ERROR;
+       if (blk_integrity_rq(req)) {
+               if (blk_rq_count_integrity_sg(q, req->bio) != 1)
+                       goto out_unmap;
 
-       memset(&cmnd, 0, sizeof(cmnd));
-       cmnd.dsm.opcode = nvme_cmd_dsm;
-       cmnd.dsm.command_id = req->tag;
-       cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
-       cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
-       cmnd.dsm.nr = 0;
-       cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+               sg_init_table(&iod->meta_sg, 1);
+               if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
+                       goto out_unmap;
 
-       __nvme_submit_cmd(nvmeq, &cmnd);
-}
+               if (rq_data_dir(req))
+                       nvme_dif_remap(req, nvme_dif_prep);
 
-static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
-                                                               int cmdid)
-{
-       struct nvme_command cmnd;
+               if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
+                       goto out_unmap;
+       }
 
-       memset(&cmnd, 0, sizeof(cmnd));
-       cmnd.common.opcode = nvme_cmd_flush;
-       cmnd.common.command_id = cmdid;
-       cmnd.common.nsid = cpu_to_le32(ns->ns_id);
+       cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+       cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
+       if (blk_integrity_rq(req))
+               cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
+       return BLK_MQ_RQ_QUEUE_OK;
 
-       __nvme_submit_cmd(nvmeq, &cmnd);
+out_unmap:
+       dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
+out:
+       return ret;
 }
 
-static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
-                                                       struct nvme_ns *ns)
+static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
-       struct request *req = iod_get_private(iod);
-       struct nvme_command cmnd;
-       u16 control = 0;
-       u32 dsmgmt = 0;
-
-       if (req->cmd_flags & REQ_FUA)
-               control |= NVME_RW_FUA;
-       if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
-               control |= NVME_RW_LR;
-
-       if (req->cmd_flags & REQ_RAHEAD)
-               dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
-
-       memset(&cmnd, 0, sizeof(cmnd));
-       cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
-       cmnd.rw.command_id = req->tag;
-       cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
-       cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-       cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
-       cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
-       cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
-
-       if (ns->ms) {
-               switch (ns->pi_type) {
-               case NVME_NS_DPS_PI_TYPE3:
-                       control |= NVME_RW_PRINFO_PRCHK_GUARD;
-                       break;
-               case NVME_NS_DPS_PI_TYPE1:
-               case NVME_NS_DPS_PI_TYPE2:
-                       control |= NVME_RW_PRINFO_PRCHK_GUARD |
-                                       NVME_RW_PRINFO_PRCHK_REF;
-                       cmnd.rw.reftag = cpu_to_le32(
-                                       nvme_block_nr(ns, blk_rq_pos(req)));
-                       break;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       enum dma_data_direction dma_dir = rq_data_dir(req) ?
+                       DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+       if (iod->nents) {
+               dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
+               if (blk_integrity_rq(req)) {
+                       if (!rq_data_dir(req))
+                               nvme_dif_remap(req, nvme_dif_complete);
+                       dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
                }
-               if (blk_integrity_rq(req))
-                       cmnd.rw.metadata =
-                               cpu_to_le64(sg_dma_address(iod->meta_sg));
-               else
-                       control |= NVME_RW_PRINFO_PRACT;
        }
 
-       cmnd.rw.control = cpu_to_le16(control);
-       cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);
+       nvme_free_iod(dev, req);
+}
 
-       __nvme_submit_cmd(nvmeq, &cmnd);
+/*
+ * We reuse the small pool to allocate the 16-byte range here as it is not
+ * worth having a special pool for these or additional cases to handle freeing
+ * the iod.
+ */
+static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+               struct request *req, struct nvme_command *cmnd)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_dsm_range *range;
 
-       return 0;
+       range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
+                                               &iod->first_dma);
+       if (!range)
+               return BLK_MQ_RQ_QUEUE_BUSY;
+       iod_list(req)[0] = (__le64 *)range;
+       iod->npages = 0;
+
+       range->cattr = cpu_to_le32(0);
+       range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
+       range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+
+       memset(cmnd, 0, sizeof(*cmnd));
+       cmnd->dsm.opcode = nvme_cmd_dsm;
+       cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
+       cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
+       cmnd->dsm.nr = 0;
+       cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+       return BLK_MQ_RQ_QUEUE_OK;
 }
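The borrowed allocation is only 16 bytes because that is the size of one DSM range descriptor; for reference, the layout as defined in include/linux/nvme.h (reproduced for context, not part of this patch):

	struct nvme_dsm_range {
		__le32	cattr;	/* context attributes, unused here (0) */
		__le32	nlb;	/* number of logical blocks */
		__le64	slba;	/* starting LBA */
	};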
 
 /*
@@ -845,9 +637,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
        struct nvme_queue *nvmeq = hctx->driver_data;
        struct nvme_dev *dev = nvmeq->dev;
        struct request *req = bd->rq;
-       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
-       struct nvme_iod *iod;
-       enum dma_data_direction dma_dir;
+       struct nvme_command cmnd;
+       int ret = BLK_MQ_RQ_QUEUE_OK;
 
        /*
         * If formatted with metadata, require the block layer provide a buffer
@@ -857,91 +648,72 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
        if (ns && ns->ms && !blk_integrity_rq(req)) {
                if (!(ns->pi_type && ns->ms == 8) &&
                                        req->cmd_type != REQ_TYPE_DRV_PRIV) {
-                       blk_mq_complete_request(req, -EFAULT);
+                       blk_mq_end_request(req, -EFAULT);
                        return BLK_MQ_RQ_QUEUE_OK;
                }
        }
 
-       iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
-       if (!iod)
-               return BLK_MQ_RQ_QUEUE_BUSY;
+       ret = nvme_init_iod(req, dev);
+       if (ret)
+               return ret;
 
        if (req->cmd_flags & REQ_DISCARD) {
-               void *range;
-               /*
-                * We reuse the small pool to allocate the 16-byte range here
-                * as it is not worth having a special pool for these or
-                * additional cases to handle freeing the iod.
-                */
-               range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
-                                               &iod->first_dma);
-               if (!range)
-                       goto retry_cmd;
-               iod_list(iod)[0] = (__le64 *)range;
-               iod->npages = 0;
-       } else if (req->nr_phys_segments) {
-               dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+               ret = nvme_setup_discard(nvmeq, ns, req, &cmnd);
+       } else {
+               if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+                       memcpy(&cmnd, req->cmd, sizeof(cmnd));
+               else if (req->cmd_flags & REQ_FLUSH)
+                       nvme_setup_flush(ns, &cmnd);
+               else
+                       nvme_setup_rw(ns, req, &cmnd);
+
+               if (req->nr_phys_segments)
+                       ret = nvme_map_data(dev, req, &cmnd);
+       }
 
-               sg_init_table(iod->sg, req->nr_phys_segments);
-               iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
-               if (!iod->nents)
-                       goto error_cmd;
+       if (ret)
+               goto out;
 
-               if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
-                       goto retry_cmd;
+       cmnd.common.command_id = req->tag;
+       blk_mq_start_request(req);
 
-               if (blk_rq_bytes(req) !=
-                    nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
-                       dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
-                       goto retry_cmd;
-               }
-               if (blk_integrity_rq(req)) {
-                       if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) {
-                               dma_unmap_sg(dev->dev, iod->sg, iod->nents,
-                                               dma_dir);
-                               goto error_cmd;
-                       }
+       spin_lock_irq(&nvmeq->q_lock);
+       __nvme_submit_cmd(nvmeq, &cmnd);
+       nvme_process_cq(nvmeq);
+       spin_unlock_irq(&nvmeq->q_lock);
+       return BLK_MQ_RQ_QUEUE_OK;
+out:
+       nvme_free_iod(dev, req);
+       return ret;
+}
 
-                       sg_init_table(iod->meta_sg, 1);
-                       if (blk_rq_map_integrity_sg(
-                                       req->q, req->bio, iod->meta_sg) != 1) {
-                               dma_unmap_sg(dev->dev, iod->sg, iod->nents,
-                                               dma_dir);
-                               goto error_cmd;
-                       }
+static void nvme_complete_rq(struct request *req)
+{
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_dev *dev = iod->nvmeq->dev;
+       int error = 0;
 
-                       if (rq_data_dir(req))
-                               nvme_dif_remap(req, nvme_dif_prep);
+       nvme_unmap_data(dev, req);
 
-                       if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) {
-                               dma_unmap_sg(dev->dev, iod->sg, iod->nents,
-                                               dma_dir);
-                               goto error_cmd;
-                       }
+       if (unlikely(req->errors)) {
+               if (nvme_req_needs_retry(req, req->errors)) {
+                       nvme_requeue_req(req);
+                       return;
                }
-       }
 
-       nvme_set_info(cmd, iod, req_completion);
-       spin_lock_irq(&nvmeq->q_lock);
-       if (req->cmd_type == REQ_TYPE_DRV_PRIV)
-               nvme_submit_priv(nvmeq, req, iod);
-       else if (req->cmd_flags & REQ_DISCARD)
-               nvme_submit_discard(nvmeq, ns, req, iod);
-       else if (req->cmd_flags & REQ_FLUSH)
-               nvme_submit_flush(nvmeq, ns, req->tag);
-       else
-               nvme_submit_iod(nvmeq, iod, ns);
+               if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+                       error = req->errors;
+               else
+                       error = nvme_error_status(req->errors);
+       }
 
-       nvme_process_cq(nvmeq);
-       spin_unlock_irq(&nvmeq->q_lock);
-       return BLK_MQ_RQ_QUEUE_OK;
+       if (unlikely(iod->aborted)) {
+               dev_warn(dev->dev,
+                       "completing aborted command with status: %04x\n",
+                       req->errors);
+       }
 
- error_cmd:
-       nvme_free_iod(dev, iod);
-       return BLK_MQ_RQ_QUEUE_ERROR;
- retry_cmd:
-       nvme_free_iod(dev, iod);
-       return BLK_MQ_RQ_QUEUE_BUSY;
+       blk_mq_end_request(req, error);
 }
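nvme_complete_rq() is the .complete half of a two-stage completion: the interrupt path (__nvme_process_cq() below) only records the NVMe status with blk_mq_complete_request(), and blk-mq later calls back into this handler, wired up in the blk_mq_ops further down, to unmap DMA, decide on retries, and end the request. In outline:

	/* hard-irq context, per completion-queue entry: */
	blk_mq_complete_request(req, status >> 1);	/* records req->errors */

	/* later, via blk_mq_ops.complete == nvme_complete_rq: */
	nvme_unmap_data(dev, req);	/* heavy lifting off the irq path */
	blk_mq_end_request(req, error);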
 
 static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
@@ -952,20 +724,47 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
        phase = nvmeq->cq_phase;
 
        for (;;) {
-               void *ctx;
-               nvme_completion_fn fn;
                struct nvme_completion cqe = nvmeq->cqes[head];
-               if ((le16_to_cpu(cqe.status) & 1) != phase)
+               u16 status = le16_to_cpu(cqe.status);
+               struct request *req;
+
+               if ((status & 1) != phase)
                        break;
                nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
                if (++head == nvmeq->q_depth) {
                        head = 0;
                        phase = !phase;
                }
+
                if (tag && *tag == cqe.command_id)
                        *tag = -1;
-               ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
-               fn(nvmeq, ctx, &cqe);
+
+               if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
+                       dev_warn(nvmeq->q_dmadev,
+                               "invalid id %d completed on queue %d\n",
+                               cqe.command_id, le16_to_cpu(cqe.sq_id));
+                       continue;
+               }
+
+               /*
+                * AEN requests are special as they don't time out and can
+                * survive any kind of queue freeze and often don't respond to
+                * aborts.  We don't even bother to allocate a struct request
+                * for them but rather special case them here.
+                */
+               if (unlikely(nvmeq->qid == 0 &&
+                               cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
+                       nvme_complete_async_event(nvmeq->dev, &cqe);
+                       continue;
+               }
+
+               req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
+               if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+                       u32 result = le32_to_cpu(cqe.result);
+                       req->special = (void *)(uintptr_t)result;
+               }
+               blk_mq_complete_request(req, status >> 1);
        }
 
        /* If the controller ignores the cq head doorbell and continuously
@@ -1028,127 +827,30 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
        return 0;
 }
 
-/*
- * Returns 0 on success.  If the result is negative, it's a Linux error code;
- * if the result is positive, it's an NVM Express status code
- */
-int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
-               void *buffer, void __user *ubuffer, unsigned bufflen,
-               u32 *result, unsigned timeout)
+static void nvme_submit_async_event(struct nvme_dev *dev)
 {
-       bool write = cmd->common.opcode & 1;
-       struct bio *bio = NULL;
-       struct request *req;
-       int ret;
-
-       req = blk_mq_alloc_request(q, write, 0);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
+       struct nvme_command c;
 
-       req->cmd_type = REQ_TYPE_DRV_PRIV;
-       req->cmd_flags |= REQ_FAILFAST_DRIVER;
-       req->__data_len = 0;
-       req->__sector = (sector_t) -1;
-       req->bio = req->biotail = NULL;
-
-       req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
-
-       req->cmd = (unsigned char *)cmd;
-       req->cmd_len = sizeof(struct nvme_command);
-       req->special = (void *)0;
-
-       if (buffer && bufflen) {
-               ret = blk_rq_map_kern(q, req, buffer, bufflen,
-                                     __GFP_DIRECT_RECLAIM);
-               if (ret)
-                       goto out;
-       } else if (ubuffer && bufflen) {
-               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
-                                     __GFP_DIRECT_RECLAIM);
-               if (ret)
-                       goto out;
-               bio = req->bio;
-       }
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = nvme_admin_async_event;
+       c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit;
 
-       blk_execute_rq(req->q, NULL, req, 0);
-       if (bio)
-               blk_rq_unmap_user(bio);
-       if (result)
-               *result = (u32)(uintptr_t)req->special;
-       ret = req->errors;
- out:
-       blk_mq_free_request(req);
-       return ret;
+       __nvme_submit_cmd(dev->queues[0], &c);
 }
 
-int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
-               void *buffer, unsigned bufflen)
+static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 {
-       return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
+       struct nvme_command c;
+
+       memset(&c, 0, sizeof(c));
+       c.delete_queue.opcode = opcode;
+       c.delete_queue.qid = cpu_to_le16(id);
+
+       return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
 
-static int nvme_submit_async_admin_req(struct nvme_dev *dev)
-{
-       struct nvme_queue *nvmeq = dev->queues[0];
-       struct nvme_command c;
-       struct nvme_cmd_info *cmd_info;
-       struct request *req;
-
-       req = blk_mq_alloc_request(dev->admin_q, WRITE,
-                       BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
-
-       req->cmd_flags |= REQ_NO_TIMEOUT;
-       cmd_info = blk_mq_rq_to_pdu(req);
-       nvme_set_info(cmd_info, NULL, async_req_completion);
-
-       memset(&c, 0, sizeof(c));
-       c.common.opcode = nvme_admin_async_event;
-       c.common.command_id = req->tag;
-
-       blk_mq_free_request(req);
-       __nvme_submit_cmd(nvmeq, &c);
-       return 0;
-}
-
-static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
-                       struct nvme_command *cmd,
-                       struct async_cmd_info *cmdinfo, unsigned timeout)
-{
-       struct nvme_queue *nvmeq = dev->queues[0];
-       struct request *req;
-       struct nvme_cmd_info *cmd_rq;
-
-       req = blk_mq_alloc_request(dev->admin_q, WRITE, 0);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
-
-       req->timeout = timeout;
-       cmd_rq = blk_mq_rq_to_pdu(req);
-       cmdinfo->req = req;
-       nvme_set_info(cmd_rq, cmdinfo, async_completion);
-       cmdinfo->status = -EINTR;
-
-       cmd->common.command_id = req->tag;
-
-       nvme_submit_cmd(nvmeq, cmd);
-       return 0;
-}
-
-static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
-{
-       struct nvme_command c;
-
-       memset(&c, 0, sizeof(c));
-       c.delete_queue.opcode = opcode;
-       c.delete_queue.qid = cpu_to_le16(id);
-
-       return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
-}
-
-static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
-                                               struct nvme_queue *nvmeq)
+static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
+                                               struct nvme_queue *nvmeq)
 {
        struct nvme_command c;
        int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
@@ -1165,7 +867,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
        c.create_cq.cq_flags = cpu_to_le16(flags);
        c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 
-       return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
+       return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
@@ -1186,7 +888,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
        c.create_sq.sq_flags = cpu_to_le16(flags);
        c.create_sq.cqid = cpu_to_le16(qid);
 
-       return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
+       return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
 
 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1199,195 +901,111 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
        return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
+static void abort_endio(struct request *req, int error)
 {
-       struct nvme_command c = { };
-       int error;
-
-       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
-       c.identify.opcode = nvme_admin_identify;
-       c.identify.cns = cpu_to_le32(1);
-
-       *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
-       if (!*id)
-               return -ENOMEM;
-
-       error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
-                       sizeof(struct nvme_id_ctrl));
-       if (error)
-               kfree(*id);
-       return error;
-}
-
-int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
-               struct nvme_id_ns **id)
-{
-       struct nvme_command c = { };
-       int error;
-
-       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
-       c.identify.opcode = nvme_admin_identify,
-       c.identify.nsid = cpu_to_le32(nsid),
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_queue *nvmeq = iod->nvmeq;
+       u32 result = (u32)(uintptr_t)req->special;
+       u16 status = req->errors;
 
-       *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
-       if (!*id)
-               return -ENOMEM;
-
-       error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
-                       sizeof(struct nvme_id_ns));
-       if (error)
-               kfree(*id);
-       return error;
-}
-
-int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
-                                       dma_addr_t dma_addr, u32 *result)
-{
-       struct nvme_command c;
-
-       memset(&c, 0, sizeof(c));
-       c.features.opcode = nvme_admin_get_features;
-       c.features.nsid = cpu_to_le32(nsid);
-       c.features.prp1 = cpu_to_le64(dma_addr);
-       c.features.fid = cpu_to_le32(fid);
-
-       return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
-                       result, 0);
-}
-
-int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
-                                       dma_addr_t dma_addr, u32 *result)
-{
-       struct nvme_command c;
-
-       memset(&c, 0, sizeof(c));
-       c.features.opcode = nvme_admin_set_features;
-       c.features.prp1 = cpu_to_le64(dma_addr);
-       c.features.fid = cpu_to_le32(fid);
-       c.features.dword11 = cpu_to_le32(dword11);
-
-       return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
-                       result, 0);
-}
-
-int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
-{
-       struct nvme_command c = { };
-       int error;
-
-       c.common.opcode = nvme_admin_get_log_page,
-       c.common.nsid = cpu_to_le32(0xFFFFFFFF),
-       c.common.cdw10[0] = cpu_to_le32(
-                       (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
-                        NVME_LOG_SMART),
-
-       *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
-       if (!*log)
-               return -ENOMEM;
+       dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x\n", status, result);
+       atomic_inc(&nvmeq->dev->ctrl.abort_limit);
 
-       error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
-                       sizeof(struct nvme_smart_log));
-       if (error)
-               kfree(*log);
-       return error;
+       blk_mq_free_request(req);
 }
 
-/**
- * nvme_abort_req - Attempt aborting a request
- *
- * Schedule controller reset if the command was already aborted once before and
- * still hasn't been returned to the driver, or if this is the admin queue.
- */
-static void nvme_abort_req(struct request *req)
+static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
-       struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-       struct nvme_queue *nvmeq = cmd_rq->nvmeq;
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct nvme_queue *nvmeq = iod->nvmeq;
        struct nvme_dev *dev = nvmeq->dev;
        struct request *abort_req;
-       struct nvme_cmd_info *abort_cmd;
        struct nvme_command cmd;
 
-       if (!nvmeq->qid || cmd_rq->aborted) {
-               spin_lock(&dev_list_lock);
-               if (!__nvme_reset(dev)) {
-                       dev_warn(dev->dev,
-                                "I/O %d QID %d timeout, reset controller\n",
-                                req->tag, nvmeq->qid);
-               }
-               spin_unlock(&dev_list_lock);
-               return;
+       /*
+        * Shutdown immediately if controller times out while starting. The
+        * reset work will see the pci device disabled when it gets the forced
+        * cancellation error. All outstanding requests are completed on
+        * shutdown, so we return BLK_EH_HANDLED.
+        */
+       if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
+               dev_warn(dev->dev,
+                        "I/O %d QID %d timeout, disable controller\n",
+                        req->tag, nvmeq->qid);
+               nvme_dev_disable(dev, false);
+               req->errors = NVME_SC_CANCELLED;
+               return BLK_EH_HANDLED;
        }
 
-       if (!dev->abort_limit)
-               return;
+       /*
+        * Shutdown the controller immediately and schedule a reset if the
+        * command was already aborted once before and still hasn't been
+        * returned to the driver, or if this is the admin queue.
+        */
+       if (!nvmeq->qid || iod->aborted) {
+               dev_warn(dev->dev,
+                        "I/O %d QID %d timeout, reset controller\n",
+                        req->tag, nvmeq->qid);
+               nvme_dev_disable(dev, false);
+               queue_work(nvme_workq, &dev->reset_work);
 
-       abort_req = blk_mq_alloc_request(dev->admin_q, WRITE,
-                       BLK_MQ_REQ_NOWAIT);
-       if (IS_ERR(abort_req))
-               return;
+               /*
+                * Mark the request as handled, since the inline shutdown
+                * forces all outstanding requests to complete.
+                */
+               req->errors = NVME_SC_CANCELLED;
+               return BLK_EH_HANDLED;
+       }
 
-       abort_cmd = blk_mq_rq_to_pdu(abort_req);
-       nvme_set_info(abort_cmd, abort_req, abort_completion);
+       iod->aborted = 1;
+
+       if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
+               atomic_inc(&dev->ctrl.abort_limit);
+               return BLK_EH_RESET_TIMER;
+       }
 
        memset(&cmd, 0, sizeof(cmd));
        cmd.abort.opcode = nvme_admin_abort_cmd;
        cmd.abort.cid = req->tag;
        cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
-       cmd.abort.command_id = abort_req->tag;
 
-       --dev->abort_limit;
-       cmd_rq->aborted = 1;
+       dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n",
+                                req->tag, nvmeq->qid);
+
+       abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
+                       BLK_MQ_REQ_NOWAIT);
+       if (IS_ERR(abort_req)) {
+               atomic_inc(&dev->ctrl.abort_limit);
+               return BLK_EH_RESET_TIMER;
+       }
+
+       abort_req->timeout = ADMIN_TIMEOUT;
+       abort_req->end_io_data = NULL;
+       blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
 
-       dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
-                                                       nvmeq->qid);
-       nvme_submit_cmd(dev->queues[0], &cmd);
+       /*
+        * The aborted request will be completed when the abort command
+        * itself completes; until then, just re-arm the timer.  If the
+        * request times out a second time, iod->aborted is already set
+        * and the reset path above is taken instead, as the device is
+        * then in a faulty state.
+        */
+       return BLK_EH_RESET_TIMER;
 }
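The abort_limit handling above is the usual reserve-then-roll-back pattern for a counting budget shared across CPUs: atomic_dec_return() optimistically claims a slot, and every path that cannot actually issue the abort gives it back. A stand-alone sketch of the pattern (hypothetical helper, C11 atomics instead of the kernel's atomic_t):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int abort_limit = 8;	/* budget from the controller */

	static bool try_take_abort_slot(void)
	{
		/* fetch_sub returns the old value; old - 1 is the new one */
		if (atomic_fetch_sub(&abort_limit, 1) - 1 < 0) {
			atomic_fetch_add(&abort_limit, 1);	/* roll back */
			return false;
		}
		return true;
	}

	int main(void)
	{
		printf("got slot: %d\n", try_take_abort_slot());
		return 0;
	}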
 
 static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
 {
        struct nvme_queue *nvmeq = data;
-       void *ctx;
-       nvme_completion_fn fn;
-       struct nvme_cmd_info *cmd;
-       struct nvme_completion cqe;
+       int status;
 
        if (!blk_mq_request_started(req))
                return;
 
-       cmd = blk_mq_rq_to_pdu(req);
-
-       if (cmd->ctx == CMD_CTX_CANCELLED)
-               return;
+       dev_warn(nvmeq->q_dmadev,
+                "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);
 
+       status = NVME_SC_ABORT_REQ;
        if (blk_queue_dying(req->q))
-               cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
-       else
-               cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
-
-
-       dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
-                                               req->tag, nvmeq->qid);
-       ctx = cancel_cmd_info(cmd, &fn);
-       fn(nvmeq, ctx, &cqe);
-}
-
-static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
-{
-       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
-       struct nvme_queue *nvmeq = cmd->nvmeq;
-
-       dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
-                                                       nvmeq->qid);
-       spin_lock_irq(&nvmeq->q_lock);
-       nvme_abort_req(req);
-       spin_unlock_irq(&nvmeq->q_lock);
-
-       /*
-        * The aborted req will be completed on receiving the abort req.
-        * We enable the timer again. If hit twice, it'll cause a device reset,
-        * as the device then is in a faulty state.
-        */
-       return BLK_EH_RESET_TIMER;
+               status |= NVME_SC_DNR;
+       blk_mq_complete_request(req, status);
 }
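This function has the shape of a blk-mq tag-iterator callback; its caller, nvme_clear_queue() (elided at the hunk boundary below), walks every started request on the queue with it. Roughly, assuming the blk_mq_all_tag_busy_iter() interface of this kernel generation:

	/* sketch of the call site: */
	if (nvmeq->tags && *nvmeq->tags)
		blk_mq_all_tag_busy_iter(*nvmeq->tags,
					 nvme_cancel_queue_ios, nvmeq);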
 
 static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1430,8 +1048,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
        nvmeq->cq_vector = -1;
        spin_unlock_irq(&nvmeq->q_lock);
 
-       if (!nvmeq->qid && nvmeq->dev->admin_q)
-               blk_mq_freeze_queue_start(nvmeq->dev->admin_q);
+       if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
+               blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q);
 
        irq_set_affinity_hint(vector, NULL);
        free_irq(vector, nvmeq);
@@ -1447,21 +1065,20 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
        spin_unlock_irq(&nvmeq->q_lock);
 }
 
-static void nvme_disable_queue(struct nvme_dev *dev, int qid)
+static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
-       struct nvme_queue *nvmeq = dev->queues[qid];
+       struct nvme_queue *nvmeq = dev->queues[0];
 
        if (!nvmeq)
                return;
        if (nvme_suspend_queue(nvmeq))
                return;
 
-       /* Don't tell the adapter to delete the admin queue.
-        * Don't tell a removed adapter to delete IO queues. */
-       if (qid && readl(&dev->bar->csts) != -1) {
-               adapter_delete_sq(dev, qid);
-               adapter_delete_cq(dev, qid);
-       }
+       if (shutdown)
+               nvme_shutdown_ctrl(&dev->ctrl);
+       else
+               nvme_disable_ctrl(&dev->ctrl, lo_hi_readq(
+                                               dev->bar + NVME_REG_CAP));
 
        spin_lock_irq(&nvmeq->q_lock);
        nvme_process_cq(nvmeq);
@@ -1472,11 +1089,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
                                int entry_size)
 {
        int q_depth = dev->q_depth;
-       unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);
+       unsigned q_size_aligned = roundup(q_depth * entry_size,
+                                         dev->ctrl.page_size);
 
        if (q_size_aligned * nr_io_queues > dev->cmb_size) {
                u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
-               mem_per_q = round_down(mem_per_q, dev->page_size);
+               mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
                q_depth = div_u64(mem_per_q, entry_size);
 
                /*
@@ -1495,8 +1113,8 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
                                int qid, int depth)
 {
        if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
-               unsigned offset = (qid - 1) *
-                                       roundup(SQ_SIZE(depth), dev->page_size);
+               unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
+                                                     dev->ctrl.page_size);
                nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
                nvmeq->sq_cmds_io = dev->cmb + offset;
        } else {
@@ -1527,7 +1145,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
        nvmeq->q_dmadev = dev->dev;
        nvmeq->dev = dev;
        snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
-                       dev->instance, qid);
+                       dev->ctrl.instance, qid);
        spin_lock_init(&nvmeq->q_lock);
        nvmeq->cq_head = 0;
        nvmeq->cq_phase = 1;
@@ -1604,79 +1222,9 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
        return result;
 }
 
-static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
-{
-       unsigned long timeout;
-       u32 bit = enabled ? NVME_CSTS_RDY : 0;
-
-       timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
-
-       while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
-               msleep(100);
-               if (fatal_signal_pending(current))
-                       return -EINTR;
-               if (time_after(jiffies, timeout)) {
-                       dev_err(dev->dev,
-                               "Device not ready; aborting %s\n", enabled ?
-                                               "initialisation" : "reset");
-                       return -ENODEV;
-               }
-       }
-
-       return 0;
-}
-
-/*
- * If the device has been passed off to us in an enabled state, just clear
- * the enabled bit.  The spec says we should set the 'shutdown notification
- * bits', but doing so may cause the device to complete commands to the
- * admin queue ... and we don't know what memory that might be pointing at!
- */
-static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
-{
-       dev->ctrl_config &= ~NVME_CC_SHN_MASK;
-       dev->ctrl_config &= ~NVME_CC_ENABLE;
-       writel(dev->ctrl_config, &dev->bar->cc);
-
-       return nvme_wait_ready(dev, cap, false);
-}
-
-static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
-{
-       dev->ctrl_config &= ~NVME_CC_SHN_MASK;
-       dev->ctrl_config |= NVME_CC_ENABLE;
-       writel(dev->ctrl_config, &dev->bar->cc);
-
-       return nvme_wait_ready(dev, cap, true);
-}
-
-static int nvme_shutdown_ctrl(struct nvme_dev *dev)
-{
-       unsigned long timeout;
-
-       dev->ctrl_config &= ~NVME_CC_SHN_MASK;
-       dev->ctrl_config |= NVME_CC_SHN_NORMAL;
-
-       writel(dev->ctrl_config, &dev->bar->cc);
-
-       timeout = SHUTDOWN_TIMEOUT + jiffies;
-       while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
-                                                       NVME_CSTS_SHST_CMPLT) {
-               msleep(100);
-               if (fatal_signal_pending(current))
-                       return -EINTR;
-               if (time_after(jiffies, timeout)) {
-                       dev_err(dev->dev,
-                               "Device shutdown incomplete; abort shutdown\n");
-                       return -ENODEV;
-               }
-       }
-
-       return 0;
-}
-
 static struct blk_mq_ops nvme_mq_admin_ops = {
        .queue_rq       = nvme_queue_rq,
+       .complete       = nvme_complete_rq,
        .map_queue      = blk_mq_map_queue,
        .init_hctx      = nvme_admin_init_hctx,
        .exit_hctx      = nvme_admin_exit_hctx,
@@ -1686,6 +1234,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
 
 static struct blk_mq_ops nvme_mq_ops = {
        .queue_rq       = nvme_queue_rq,
+       .complete       = nvme_complete_rq,
        .map_queue      = blk_mq_map_queue,
        .init_hctx      = nvme_init_hctx,
        .init_request   = nvme_init_request,
@@ -1695,19 +1244,23 @@ static struct blk_mq_ops nvme_mq_ops = {
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
-       if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
-               blk_cleanup_queue(dev->admin_q);
+       if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+               blk_cleanup_queue(dev->ctrl.admin_q);
                blk_mq_free_tag_set(&dev->admin_tagset);
        }
 }
 
 static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 {
-       if (!dev->admin_q) {
+       if (!dev->ctrl.admin_q) {
                dev->admin_tagset.ops = &nvme_mq_admin_ops;
                dev->admin_tagset.nr_hw_queues = 1;
-               dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
-               dev->admin_tagset.reserved_tags = 1;
+
+               /*
+                * Subtract one to leave an empty queue entry for 'Full Queue'
+                * condition. See NVM-Express 1.2 specification, section 4.1.2.
+                */
+               dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1;
                dev->admin_tagset.timeout = ADMIN_TIMEOUT;
                dev->admin_tagset.numa_node = dev_to_node(dev->dev);
                dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
@@ -1716,18 +1269,18 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
                if (blk_mq_alloc_tag_set(&dev->admin_tagset))
                        return -ENOMEM;
 
-               dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
-               if (IS_ERR(dev->admin_q)) {
+               dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
+               if (IS_ERR(dev->ctrl.admin_q)) {
                        blk_mq_free_tag_set(&dev->admin_tagset);
                        return -ENOMEM;
                }
-               if (!blk_get_queue(dev->admin_q)) {
+               if (!blk_get_queue(dev->ctrl.admin_q)) {
                        nvme_dev_remove_admin(dev);
-                       dev->admin_q = NULL;
+                       dev->ctrl.admin_q = NULL;
                        return -ENODEV;
                }
        } else
-               blk_mq_unfreeze_queue(dev->admin_q);
+               blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
 
        return 0;
 }
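The depth minus one above is ring arithmetic rather than a blk-mq quirk: with head == tail meaning "empty", a queue of N slots can hold at most N - 1 commands, or full and empty would be indistinguishable. In index terms:

	/* ring of N slots: */
	bool empty = (head == tail);
	bool full  = ((tail + 1) % N == head);	/* one slot stays unused */
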
@@ -1736,31 +1289,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
        int result;
        u32 aqa;
-       u64 cap = lo_hi_readq(&dev->bar->cap);
+       u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
        struct nvme_queue *nvmeq;
-       /*
-        * default to a 4K page size, with the intention to update this
-        * path in the future to accomodate architectures with differing
-        * kernel and IO page sizes.
-        */
-       unsigned page_shift = 12;
-       unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
-
-       if (page_shift < dev_page_min) {
-               dev_err(dev->dev,
-                               "Minimum device page size (%u) too large for "
-                               "host (%u)\n", 1 << dev_page_min,
-                               1 << page_shift);
-               return -ENODEV;
-       }
 
-       dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
+       dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
                                                NVME_CAP_NSSRC(cap) : 0;
 
-       if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO))
-               writel(NVME_CSTS_NSSRO, &dev->bar->csts);
+       if (dev->subsystem &&
+           (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
+               writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
 
-       result = nvme_disable_ctrl(dev, cap);
+       result = nvme_disable_ctrl(&dev->ctrl, cap);
        if (result < 0)
                return result;
 
@@ -1774,18 +1313,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
        aqa = nvmeq->q_depth - 1;
        aqa |= aqa << 16;
 
-       dev->page_size = 1 << page_shift;
-
-       dev->ctrl_config = NVME_CC_CSS_NVM;
-       dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
-       dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
-       dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
-
-       writel(aqa, &dev->bar->aqa);
-       lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
-       lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
+       writel(aqa, dev->bar + NVME_REG_AQA);
+       lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
+       lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
 
-       result = nvme_enable_ctrl(dev, cap);
+       result = nvme_enable_ctrl(&dev->ctrl, cap);
        if (result)
                goto free_nvmeq;
 
@@ -1803,406 +1335,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
        return result;
 }
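
A note on the aqa value computed above: the AQA register carries the 0's-based admin submission queue size (ASQS) in bits 11:0 and the 0's-based admin completion queue size (ACQS) in bits 27:16, and this driver uses one depth for both queues, hence the shift-and-or. A sketch of the packing (the helper name is invented for illustration):

static u32 nvme_pack_aqa(u16 depth)
{
	u32 size = depth - 1;		/* the spec encodes queue sizes 0's based */

	return size | (size << 16);	/* ASQS in bits 11:0, ACQS in bits 27:16 */
}
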
 
-static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
-{
-       struct nvme_dev *dev = ns->dev;
-       struct nvme_user_io io;
-       struct nvme_command c;
-       unsigned length, meta_len;
-       int status, write;
-       dma_addr_t meta_dma = 0;
-       void *meta = NULL;
-       void __user *metadata;
-
-       if (copy_from_user(&io, uio, sizeof(io)))
-               return -EFAULT;
-
-       switch (io.opcode) {
-       case nvme_cmd_write:
-       case nvme_cmd_read:
-       case nvme_cmd_compare:
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       length = (io.nblocks + 1) << ns->lba_shift;
-       meta_len = (io.nblocks + 1) * ns->ms;
-       metadata = (void __user *)(uintptr_t)io.metadata;
-       write = io.opcode & 1;
-
-       if (ns->ext) {
-               length += meta_len;
-               meta_len = 0;
-       }
-       if (meta_len) {
-               if (((io.metadata & 3) || !io.metadata) && !ns->ext)
-                       return -EINVAL;
-
-               meta = dma_alloc_coherent(dev->dev, meta_len,
-                                               &meta_dma, GFP_KERNEL);
-
-               if (!meta) {
-                       status = -ENOMEM;
-                       goto unmap;
-               }
-               if (write) {
-                       if (copy_from_user(meta, metadata, meta_len)) {
-                               status = -EFAULT;
-                               goto unmap;
-                       }
-               }
-       }
-
-       memset(&c, 0, sizeof(c));
-       c.rw.opcode = io.opcode;
-       c.rw.flags = io.flags;
-       c.rw.nsid = cpu_to_le32(ns->ns_id);
-       c.rw.slba = cpu_to_le64(io.slba);
-       c.rw.length = cpu_to_le16(io.nblocks);
-       c.rw.control = cpu_to_le16(io.control);
-       c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
-       c.rw.reftag = cpu_to_le32(io.reftag);
-       c.rw.apptag = cpu_to_le16(io.apptag);
-       c.rw.appmask = cpu_to_le16(io.appmask);
-       c.rw.metadata = cpu_to_le64(meta_dma);
-
-       status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
-                       (void __user *)(uintptr_t)io.addr, length, NULL, 0);
- unmap:
-       if (meta) {
-               if (status == NVME_SC_SUCCESS && !write) {
-                       if (copy_to_user(metadata, meta, meta_len))
-                               status = -EFAULT;
-               }
-               dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
-       }
-       return status;
-}
-
-static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
-                       struct nvme_passthru_cmd __user *ucmd)
-{
-       struct nvme_passthru_cmd cmd;
-       struct nvme_command c;
-       unsigned timeout = 0;
-       int status;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EACCES;
-       if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
-               return -EFAULT;
-
-       memset(&c, 0, sizeof(c));
-       c.common.opcode = cmd.opcode;
-       c.common.flags = cmd.flags;
-       c.common.nsid = cpu_to_le32(cmd.nsid);
-       c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
-       c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-       c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
-       c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
-       c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
-       c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
-       c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
-       c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
-
-       if (cmd.timeout_ms)
-               timeout = msecs_to_jiffies(cmd.timeout_ms);
-
-       status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
-                       NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
-                       &cmd.result, timeout);
-       if (status >= 0) {
-               if (put_user(cmd.result, &ucmd->result))
-                       return -EFAULT;
-       }
-
-       return status;
-}
-
-static int nvme_subsys_reset(struct nvme_dev *dev)
-{
-       if (!dev->subsystem)
-               return -ENOTTY;
-
-       writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
-       return 0;
-}
-
-static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
-                                                       unsigned long arg)
-{
-       struct nvme_ns *ns = bdev->bd_disk->private_data;
-
-       switch (cmd) {
-       case NVME_IOCTL_ID:
-               force_successful_syscall_return();
-               return ns->ns_id;
-       case NVME_IOCTL_ADMIN_CMD:
-               return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
-       case NVME_IOCTL_IO_CMD:
-               return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
-       case NVME_IOCTL_SUBMIT_IO:
-               return nvme_submit_io(ns, (void __user *)arg);
-       case SG_GET_VERSION_NUM:
-               return nvme_sg_get_version_num((void __user *)arg);
-       case SG_IO:
-               return nvme_sg_io(ns, (void __user *)arg);
-       default:
-               return -ENOTTY;
-       }
-}
-
-#ifdef CONFIG_COMPAT
-static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
-                                       unsigned int cmd, unsigned long arg)
-{
-       switch (cmd) {
-       case SG_IO:
-               return -ENOIOCTLCMD;
-       }
-       return nvme_ioctl(bdev, mode, cmd, arg);
-}
-#else
-#define nvme_compat_ioctl      NULL
-#endif
-
-static void nvme_free_dev(struct kref *kref);
-static void nvme_free_ns(struct kref *kref)
-{
-       struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
-
-       if (ns->type == NVME_NS_LIGHTNVM)
-               nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
-
-       spin_lock(&dev_list_lock);
-       ns->disk->private_data = NULL;
-       spin_unlock(&dev_list_lock);
-
-       kref_put(&ns->dev->kref, nvme_free_dev);
-       put_disk(ns->disk);
-       kfree(ns);
-}
-
-static int nvme_open(struct block_device *bdev, fmode_t mode)
-{
-       int ret = 0;
-       struct nvme_ns *ns;
-
-       spin_lock(&dev_list_lock);
-       ns = bdev->bd_disk->private_data;
-       if (!ns)
-               ret = -ENXIO;
-       else if (!kref_get_unless_zero(&ns->kref))
-               ret = -ENXIO;
-       spin_unlock(&dev_list_lock);
-
-       return ret;
-}
-
-static void nvme_release(struct gendisk *disk, fmode_t mode)
-{
-       struct nvme_ns *ns = disk->private_data;
-       kref_put(&ns->kref, nvme_free_ns);
-}
-
-static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
-{
-       /* some standard values */
-       geo->heads = 1 << 6;
-       geo->sectors = 1 << 5;
-       geo->cylinders = get_capacity(bd->bd_disk) >> 11;
-       return 0;
-}
-
-static void nvme_config_discard(struct nvme_ns *ns)
-{
-       u32 logical_block_size = queue_logical_block_size(ns->queue);
-       ns->queue->limits.discard_zeroes_data = 0;
-       ns->queue->limits.discard_alignment = logical_block_size;
-       ns->queue->limits.discard_granularity = logical_block_size;
-       blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
-       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
-}
-
-static int nvme_revalidate_disk(struct gendisk *disk)
-{
-       struct nvme_ns *ns = disk->private_data;
-       struct nvme_dev *dev = ns->dev;
-       struct nvme_id_ns *id;
-       u8 lbaf, pi_type;
-       u16 old_ms;
-       unsigned short bs;
-
-       if (nvme_identify_ns(dev, ns->ns_id, &id)) {
-               dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__,
-                                               dev->instance, ns->ns_id);
-               return -ENODEV;
-       }
-       if (id->ncap == 0) {
-               kfree(id);
-               return -ENODEV;
-       }
-
-       if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
-               if (nvme_nvm_register(ns->queue, disk->disk_name)) {
-                       dev_warn(dev->dev,
-                               "%s: LightNVM init failure\n", __func__);
-                       kfree(id);
-                       return -ENODEV;
-               }
-               ns->type = NVME_NS_LIGHTNVM;
-       }
-
-       old_ms = ns->ms;
-       lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
-       ns->lba_shift = id->lbaf[lbaf].ds;
-       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
-       ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
-
-       /*
-        * If identify namespace failed, use default 512 byte block size so
-        * block layer can use before failing read/write for 0 capacity.
-        */
-       if (ns->lba_shift == 0)
-               ns->lba_shift = 9;
-       bs = 1 << ns->lba_shift;
-
-       /* XXX: PI implementation requires metadata equal t10 pi tuple size */
-       pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
-                                       id->dps & NVME_NS_DPS_PI_MASK : 0;
-
-       blk_mq_freeze_queue(disk->queue);
-       if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
-                               ns->ms != old_ms ||
-                               bs != queue_logical_block_size(disk->queue) ||
-                               (ns->ms && ns->ext)))
-               blk_integrity_unregister(disk);
-
-       ns->pi_type = pi_type;
-       blk_queue_logical_block_size(ns->queue, bs);
-
-       if (ns->ms && !ns->ext)
-               nvme_init_integrity(ns);
-
-       if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
-                                               !blk_get_integrity(disk)) ||
-                                               ns->type == NVME_NS_LIGHTNVM)
-               set_capacity(disk, 0);
-       else
-               set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
-
-       if (dev->oncs & NVME_CTRL_ONCS_DSM)
-               nvme_config_discard(ns);
-       blk_mq_unfreeze_queue(disk->queue);
-
-       kfree(id);
-       return 0;
-}
-
-static char nvme_pr_type(enum pr_type type)
-{
-       switch (type) {
-       case PR_WRITE_EXCLUSIVE:
-               return 1;
-       case PR_EXCLUSIVE_ACCESS:
-               return 2;
-       case PR_WRITE_EXCLUSIVE_REG_ONLY:
-               return 3;
-       case PR_EXCLUSIVE_ACCESS_REG_ONLY:
-               return 4;
-       case PR_WRITE_EXCLUSIVE_ALL_REGS:
-               return 5;
-       case PR_EXCLUSIVE_ACCESS_ALL_REGS:
-               return 6;
-       default:
-               return 0;
-       }
-};
-
-static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
-                               u64 key, u64 sa_key, u8 op)
-{
-       struct nvme_ns *ns = bdev->bd_disk->private_data;
-       struct nvme_command c;
-       u8 data[16] = { 0, };
-
-       put_unaligned_le64(key, &data[0]);
-       put_unaligned_le64(sa_key, &data[8]);
-
-       memset(&c, 0, sizeof(c));
-       c.common.opcode = op;
-       c.common.nsid = cpu_to_le32(ns->ns_id);
-       c.common.cdw10[0] = cpu_to_le32(cdw10);
-
-       return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
-}
-
-static int nvme_pr_register(struct block_device *bdev, u64 old,
-               u64 new, unsigned flags)
-{
-       u32 cdw10;
-
-       if (flags & ~PR_FL_IGNORE_KEY)
-               return -EOPNOTSUPP;
-
-       cdw10 = old ? 2 : 0;
-       cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
-       cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
-       return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
-}
-
-static int nvme_pr_reserve(struct block_device *bdev, u64 key,
-               enum pr_type type, unsigned flags)
-{
-       u32 cdw10;
-
-       if (flags & ~PR_FL_IGNORE_KEY)
-               return -EOPNOTSUPP;
-
-       cdw10 = nvme_pr_type(type) << 8;
-       cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
-       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
-}
-
-static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
-               enum pr_type type, bool abort)
-{
-       u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
-       return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
-}
-
-static int nvme_pr_clear(struct block_device *bdev, u64 key)
-{
-       u32 cdw10 = 1 | (key ? 1 << 3 : 0);
-       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
-}
-
-static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
-{
-       u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
-       return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
-}
-
-static const struct pr_ops nvme_pr_ops = {
-       .pr_register    = nvme_pr_register,
-       .pr_reserve     = nvme_pr_reserve,
-       .pr_release     = nvme_pr_release,
-       .pr_preempt     = nvme_pr_preempt,
-       .pr_clear       = nvme_pr_clear,
-};
-
-static const struct block_device_operations nvme_fops = {
-       .owner          = THIS_MODULE,
-       .ioctl          = nvme_ioctl,
-       .compat_ioctl   = nvme_compat_ioctl,
-       .open           = nvme_open,
-       .release        = nvme_release,
-       .getgeo         = nvme_getgeo,
-       .revalidate_disk= nvme_revalidate_disk,
-       .pr_ops         = &nvme_pr_ops,
-};
-
 static int nvme_kthread(void *data)
 {
        struct nvme_dev *dev, *next;
@@ -2212,14 +1344,20 @@ static int nvme_kthread(void *data)
                spin_lock(&dev_list_lock);
                list_for_each_entry_safe(dev, next, &dev_list, node) {
                        int i;
-                       u32 csts = readl(&dev->bar->csts);
+                       u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+                       /*
+                        * Skip controllers currently under reset.
+                        */
+                       if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
+                               continue;
 
                        if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
                                                        csts & NVME_CSTS_CFS) {
-                               if (!__nvme_reset(dev)) {
+                               if (queue_work(nvme_workq, &dev->reset_work)) {
                                        dev_warn(dev->dev,
                                                "Failed status: %x, reset controller\n",
-                                               readl(&dev->bar->csts));
+                                               readl(dev->bar + NVME_REG_CSTS));
                                }
                                continue;
                        }
@@ -2230,11 +1368,8 @@ static int nvme_kthread(void *data)
                                spin_lock_irq(&nvmeq->q_lock);
                                nvme_process_cq(nvmeq);
 
-                               while ((i == 0) && (dev->event_limit > 0)) {
-                                       if (nvme_submit_async_admin_req(dev))
-                                               break;
-                                       dev->event_limit--;
-                               }
+                               while (i == 0 && dev->ctrl.event_limit > 0)
+                                       nvme_submit_async_event(dev);
                                spin_unlock_irq(&nvmeq->q_lock);
                        }
                }
@@ -2244,127 +1379,33 @@ static int nvme_kthread(void *data)
        return 0;
 }
 
-static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
-{
-       struct nvme_ns *ns;
-       struct gendisk *disk;
-       int node = dev_to_node(dev->dev);
-
-       ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
-       if (!ns)
-               return;
-
-       ns->queue = blk_mq_init_queue(&dev->tagset);
-       if (IS_ERR(ns->queue))
-               goto out_free_ns;
-       queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
-       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
-       ns->dev = dev;
-       ns->queue->queuedata = ns;
-
-       disk = alloc_disk_node(0, node);
-       if (!disk)
-               goto out_free_queue;
-
-       kref_init(&ns->kref);
-       ns->ns_id = nsid;
-       ns->disk = disk;
-       ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
-       list_add_tail(&ns->list, &dev->namespaces);
-
-       blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-       if (dev->max_hw_sectors) {
-               blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
-               blk_queue_max_segments(ns->queue,
-                       (dev->max_hw_sectors / (dev->page_size >> 9)) + 1);
-       }
-       if (dev->stripe_size)
-               blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
-       if (dev->vwc & NVME_CTRL_VWC_PRESENT)
-               blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
-       blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
-
-       disk->major = nvme_major;
-       disk->first_minor = 0;
-       disk->fops = &nvme_fops;
-       disk->private_data = ns;
-       disk->queue = ns->queue;
-       disk->driverfs_dev = dev->device;
-       disk->flags = GENHD_FL_EXT_DEVT;
-       sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
-
-       /*
-        * Initialize capacity to 0 until we establish the namespace format and
-        * setup integrity extentions if necessary. The revalidate_disk after
-        * add_disk allows the driver to register with integrity if the format
-        * requires it.
-        */
-       set_capacity(disk, 0);
-       if (nvme_revalidate_disk(ns->disk))
-               goto out_free_disk;
-
-       kref_get(&dev->kref);
-       if (ns->type != NVME_NS_LIGHTNVM) {
-               add_disk(ns->disk);
-               if (ns->ms) {
-                       struct block_device *bd = bdget_disk(ns->disk, 0);
-                       if (!bd)
-                               return;
-                       if (blkdev_get(bd, FMODE_READ, NULL)) {
-                               bdput(bd);
-                               return;
-                       }
-                       blkdev_reread_part(bd);
-                       blkdev_put(bd, FMODE_READ);
-               }
-       }
-       return;
- out_free_disk:
-       kfree(disk);
-       list_del(&ns->list);
- out_free_queue:
-       blk_cleanup_queue(ns->queue);
- out_free_ns:
-       kfree(ns);
-}
-
-/*
- * Create I/O queues.  Failing to create an I/O queue is not an issue,
- * we can continue with less than the desired amount of queues, and
- * even a controller without I/O queues an still be used to issue
- * admin commands.  This might be useful to upgrade a buggy firmware
- * for example.
- */
-static void nvme_create_io_queues(struct nvme_dev *dev)
+static int nvme_create_io_queues(struct nvme_dev *dev)
 {
        unsigned i;
+       int ret = 0;
 
-       for (i = dev->queue_count; i <= dev->max_qid; i++)
-               if (!nvme_alloc_queue(dev, i, dev->q_depth))
+       for (i = dev->queue_count; i <= dev->max_qid; i++) {
+               if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
+                       ret = -ENOMEM;
                        break;
+               }
+       }
 
-       for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
-               if (nvme_create_queue(dev->queues[i], i)) {
+       for (i = dev->online_queues; i <= dev->queue_count - 1; i++) {
+               ret = nvme_create_queue(dev->queues[i], i);
+               if (ret) {
                        nvme_free_queues(dev, i);
                        break;
                }
-}
-
-static int set_queue_count(struct nvme_dev *dev, int count)
-{
-       int status;
-       u32 result;
-       u32 q_count = (count - 1) | ((count - 1) << 16);
-
-       status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
-                                                               &result);
-       if (status < 0)
-               return status;
-       if (status > 0) {
-               dev_err(dev->dev, "Could not set queue count (%d)\n", status);
-               return 0;
        }
-       return min(result & 0xffff, result >> 16) + 1;
+
+       /*
+        * Ignore failing Create SQ/CQ commands; we can continue with fewer
+        * than the desired number of queues, and even a controller without
+        * I/O queues can still be used to issue admin commands.  This might
+        * be useful to upgrade buggy firmware, for example.
+        */
+       return ret >= 0 ? 0 : ret;
 }
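
The final return leans on a convention used throughout these helpers: negative values are host-side errnos (the command never reached the device, so setup must fail), while positive values are NVMe status codes from the controller, which this function deliberately swallows per the comment. Restated as a sketch, with a hypothetical helper name:

static int nvme_tolerate_device_status(int ret)
{
	if (ret < 0)
		return ret;	/* host-side errno: a real failure */
	return 0;		/* positive NVMe status: degrade to fewer queues */
}
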
 
 static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
@@ -2379,11 +1420,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
        if (!use_cmb_sqes)
                return NULL;
 
-       dev->cmbsz = readl(&dev->bar->cmbsz);
+       dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
        if (!(NVME_CMB_SZ(dev->cmbsz)))
                return NULL;
 
-       cmbloc = readl(&dev->bar->cmbloc);
+       cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
 
        szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
        size = szu * NVME_CMB_SZ(dev->cmbsz);
@@ -2431,11 +1472,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
        int result, i, vecs, nr_io_queues, size;
 
        nr_io_queues = num_possible_cpus();
-       result = set_queue_count(dev, nr_io_queues);
-       if (result <= 0)
+       result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
+       if (result < 0)
                return result;
-       if (result < nr_io_queues)
-               nr_io_queues = result;
+
+       /*
+        * Degraded controllers might return an error when setting the queue
+        * count.  We still want to be able to bring them online and offer
+        * access to the admin queue, as that might be the only way to fix
+        * them up.
+        */
+       if (result > 0) {
+               dev_err(dev->dev, "Could not set queue count (%d)\n", result);
+               nr_io_queues = 0;
+               result = 0;
+       }
 
        if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
                result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -2457,7 +1507,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
                                return -ENOMEM;
                        size = db_bar_size(dev, nr_io_queues);
                } while (1);
-               dev->dbs = ((void __iomem *)dev->bar) + 4096;
+               dev->dbs = dev->bar + 4096;
                adminq->q_db = dev->dbs;
        }
 
@@ -2501,115 +1551,115 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
        /* Free previously allocated queues that are no longer usable */
        nvme_free_queues(dev, nr_io_queues + 1);
-       nvme_create_io_queues(dev);
-
-       return 0;
+       return nvme_create_io_queues(dev);
 
  free_queues:
        nvme_free_queues(dev, 1);
        return result;
 }
 
-static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
+static void nvme_set_irq_hints(struct nvme_dev *dev)
 {
-       struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
-       struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
+       struct nvme_queue *nvmeq;
+       int i;
 
-       return nsa->ns_id - nsb->ns_id;
-}
+       for (i = 0; i < dev->online_queues; i++) {
+               nvmeq = dev->queues[i];
 
-static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid)
-{
-       struct nvme_ns *ns;
+               if (!nvmeq->tags || !(*nvmeq->tags))
+                       continue;
 
-       list_for_each_entry(ns, &dev->namespaces, list) {
-               if (ns->ns_id == nsid)
-                       return ns;
-               if (ns->ns_id > nsid)
-                       break;
+               irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
+                                       blk_mq_tags_cpumask(*nvmeq->tags));
        }
-       return NULL;
 }
 
-static inline bool nvme_io_incapable(struct nvme_dev *dev)
+static void nvme_dev_scan(struct work_struct *work)
 {
-       return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS ||
-                                                       dev->online_queues < 2);
+       struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
+
+       if (!dev->tagset.tags)
+               return;
+       nvme_scan_namespaces(&dev->ctrl);
+       nvme_set_irq_hints(dev);
 }
 
-static void nvme_ns_remove(struct nvme_ns *ns)
+static void nvme_del_queue_end(struct request *req, int error)
 {
-       bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue);
-
-       if (kill) {
-               blk_set_queue_dying(ns->queue);
+       struct nvme_queue *nvmeq = req->end_io_data;
 
-               /*
-                * The controller was shutdown first if we got here through
-                * device removal. The shutdown may requeue outstanding
-                * requests. These need to be aborted immediately so
-                * del_gendisk doesn't block indefinitely for their completion.
-                */
-               blk_mq_abort_requeue_list(ns->queue);
-       }
-       if (ns->disk->flags & GENHD_FL_UP)
-               del_gendisk(ns->disk);
-       if (kill || !blk_queue_dying(ns->queue)) {
-               blk_mq_abort_requeue_list(ns->queue);
-               blk_cleanup_queue(ns->queue);
-       }
-       list_del_init(&ns->list);
-       kref_put(&ns->kref, nvme_free_ns);
+       blk_mq_free_request(req);
+       complete(&nvmeq->dev->ioq_wait);
 }
 
-static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
+static void nvme_del_cq_end(struct request *req, int error)
 {
-       struct nvme_ns *ns, *next;
-       unsigned i;
+       struct nvme_queue *nvmeq = req->end_io_data;
 
-       for (i = 1; i <= nn; i++) {
-               ns = nvme_find_ns(dev, i);
-               if (ns) {
-                       if (revalidate_disk(ns->disk))
-                               nvme_ns_remove(ns);
-               } else
-                       nvme_alloc_ns(dev, i);
-       }
-       list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
-               if (ns->ns_id > nn)
-                       nvme_ns_remove(ns);
+       if (!error) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&nvmeq->q_lock, flags);
+               nvme_process_cq(nvmeq);
+               spin_unlock_irqrestore(&nvmeq->q_lock, flags);
        }
-       list_sort(NULL, &dev->namespaces, ns_cmp);
+
+       nvme_del_queue_end(req, error);
 }
 
-static void nvme_set_irq_hints(struct nvme_dev *dev)
+static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 {
-       struct nvme_queue *nvmeq;
-       int i;
+       struct request_queue *q = nvmeq->dev->ctrl.admin_q;
+       struct request *req;
+       struct nvme_command cmd;
 
-       for (i = 0; i < dev->online_queues; i++) {
-               nvmeq = dev->queues[i];
+       memset(&cmd, 0, sizeof(cmd));
+       cmd.delete_queue.opcode = opcode;
+       cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
 
-               if (!nvmeq->tags || !(*nvmeq->tags))
-                       continue;
+       req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
 
-               irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
-                                       blk_mq_tags_cpumask(*nvmeq->tags));
-       }
+       req->timeout = ADMIN_TIMEOUT;
+       req->end_io_data = nvmeq;
+
+       blk_execute_rq_nowait(q, NULL, req, false,
+                       opcode == nvme_admin_delete_cq ?
+                               nvme_del_cq_end : nvme_del_queue_end);
+       return 0;
 }
 
-static void nvme_dev_scan(struct work_struct *work)
+static void nvme_disable_io_queues(struct nvme_dev *dev)
 {
-       struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
-       struct nvme_id_ctrl *ctrl;
+       int pass;
+       unsigned long timeout;
+       u8 opcode = nvme_admin_delete_sq;
 
-       if (!dev->tagset.tags)
-               return;
-       if (nvme_identify_ctrl(dev, &ctrl))
-               return;
-       nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
-       kfree(ctrl);
-       nvme_set_irq_hints(dev);
+       for (pass = 0; pass < 2; pass++) {
+               int sent = 0, i = dev->queue_count - 1;
+
+               reinit_completion(&dev->ioq_wait);
+ retry:
+               timeout = ADMIN_TIMEOUT;
+               for (; i > 0; i--) {
+                       struct nvme_queue *nvmeq = dev->queues[i];
+
+                       if (!pass)
+                               nvme_suspend_queue(nvmeq);
+                       if (nvme_delete_queue(nvmeq, opcode))
+                               break;
+                       ++sent;
+               }
+               while (sent--) {
+                       timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
+                       if (timeout == 0)
+                               return;
+                       if (i)
+                               goto retry;
+               }
+               opcode = nvme_admin_delete_cq;
+       }
 }
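
The replacement nvme_disable_io_queues() is a fan-out/fan-in: the first pass suspends each queue and fires an asynchronous Delete SQ for it, the second pass repeats with Delete CQ (a submission queue must be deleted before its completion queue), and each pass then reaps one ioq_wait completion per request it managed to send, carrying the unconsumed timeout across waits. The reap step in isolation, with illustrative names:

static bool nvme_reap_queue_deletions(struct nvme_dev *dev, int sent)
{
	unsigned long timeout = ADMIN_TIMEOUT;

	while (sent--) {
		timeout = wait_for_completion_io_timeout(&dev->ioq_wait,
							 timeout);
		if (!timeout)
			return false;	/* controller stopped responding */
	}
	return true;
}
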
 
 /*
@@ -2620,42 +1670,7 @@ static void nvme_dev_scan(struct work_struct *work)
  */
 static int nvme_dev_add(struct nvme_dev *dev)
 {
-       struct pci_dev *pdev = to_pci_dev(dev->dev);
-       int res;
-       struct nvme_id_ctrl *ctrl;
-       int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12;
-
-       res = nvme_identify_ctrl(dev, &ctrl);
-       if (res) {
-               dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
-               return -EIO;
-       }
-
-       dev->oncs = le16_to_cpup(&ctrl->oncs);
-       dev->abort_limit = ctrl->acl + 1;
-       dev->vwc = ctrl->vwc;
-       memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
-       memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
-       memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
-       if (ctrl->mdts)
-               dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
-       else
-               dev->max_hw_sectors = UINT_MAX;
-       if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
-                       (pdev->device == 0x0953) && ctrl->vs[3]) {
-               unsigned int max_hw_sectors;
-
-               dev->stripe_size = 1 << (ctrl->vs[3] + shift);
-               max_hw_sectors = dev->stripe_size >> (shift - 9);
-               if (dev->max_hw_sectors) {
-                       dev->max_hw_sectors = min(max_hw_sectors,
-                                                       dev->max_hw_sectors);
-               } else
-                       dev->max_hw_sectors = max_hw_sectors;
-       }
-       kfree(ctrl);
-
-       if (!dev->tagset.tags) {
+       if (!dev->ctrl.tagset) {
                dev->tagset.ops = &nvme_mq_ops;
                dev->tagset.nr_hw_queues = dev->online_queues - 1;
                dev->tagset.timeout = NVME_IO_TIMEOUT;
@@ -2668,8 +1683,9 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
                if (blk_mq_alloc_tag_set(&dev->tagset))
                        return 0;
+               dev->ctrl.tagset = &dev->tagset;
        }
-       schedule_work(&dev->scan_work);
+       queue_work(nvme_workq, &dev->scan_work);
        return 0;
 }
 
@@ -2699,7 +1715,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
        if (!dev->bar)
                goto disable;
 
-       if (readl(&dev->bar->csts) == -1) {
+       if (readl(dev->bar + NVME_REG_CSTS) == -1) {
                result = -ENODEV;
                goto unmap;
        }
@@ -2714,10 +1730,11 @@ static int nvme_dev_map(struct nvme_dev *dev)
                        goto unmap;
        }
 
-       cap = lo_hi_readq(&dev->bar->cap);
+       cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
+
        dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
        dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
-       dev->dbs = ((void __iomem *)dev->bar) + 4096;
+       dev->dbs = dev->bar + 4096;
 
        /*
         * Temporary fix for the Apple controller found in the MacBook8,1 and
@@ -2730,9 +1747,11 @@ static int nvme_dev_map(struct nvme_dev *dev)
                        dev->q_depth);
        }
 
-       if (readl(&dev->bar->vs) >= NVME_VS(1, 2))
+       if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
                dev->cmb = nvme_map_cmb(dev);
 
+       pci_enable_pcie_error_reporting(pdev);
+       pci_save_state(pdev);
        return 0;
 
  unmap:
@@ -2760,152 +1779,34 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
                pci_release_regions(pdev);
        }
 
-       if (pci_is_enabled(pdev))
+       if (pci_is_enabled(pdev)) {
+               pci_disable_pcie_error_reporting(pdev);
                pci_disable_device(pdev);
-}
-
-struct nvme_delq_ctx {
-       struct task_struct *waiter;
-       struct kthread_worker *worker;
-       atomic_t refcount;
-};
-
-static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
-{
-       dq->waiter = current;
-       mb();
-
-       for (;;) {
-               set_current_state(TASK_KILLABLE);
-               if (!atomic_read(&dq->refcount))
-                       break;
-               if (!schedule_timeout(ADMIN_TIMEOUT) ||
-                                       fatal_signal_pending(current)) {
-                       /*
-                        * Disable the controller first since we can't trust it
-                        * at this point, but leave the admin queue enabled
-                        * until all queue deletion requests are flushed.
-                        * FIXME: This may take a while if there are more h/w
-                        * queues than admin tags.
-                        */
-                       set_current_state(TASK_RUNNING);
-                       nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap));
-                       nvme_clear_queue(dev->queues[0]);
-                       flush_kthread_worker(dq->worker);
-                       nvme_disable_queue(dev, 0);
-                       return;
-               }
        }
-       set_current_state(TASK_RUNNING);
-}
-
-static void nvme_put_dq(struct nvme_delq_ctx *dq)
-{
-       atomic_dec(&dq->refcount);
-       if (dq->waiter)
-               wake_up_process(dq->waiter);
-}
-
-static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
-{
-       atomic_inc(&dq->refcount);
-       return dq;
-}
-
-static void nvme_del_queue_end(struct nvme_queue *nvmeq)
-{
-       struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
-       nvme_put_dq(dq);
-
-       spin_lock_irq(&nvmeq->q_lock);
-       nvme_process_cq(nvmeq);
-       spin_unlock_irq(&nvmeq->q_lock);
-}
-
-static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
-                                               kthread_work_func_t fn)
-{
-       struct nvme_command c;
-
-       memset(&c, 0, sizeof(c));
-       c.delete_queue.opcode = opcode;
-       c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
-
-       init_kthread_work(&nvmeq->cmdinfo.work, fn);
-       return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
-                                                               ADMIN_TIMEOUT);
-}
-
-static void nvme_del_cq_work_handler(struct kthread_work *work)
-{
-       struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
-                                                       cmdinfo.work);
-       nvme_del_queue_end(nvmeq);
-}
-
-static int nvme_delete_cq(struct nvme_queue *nvmeq)
-{
-       return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
-                                               nvme_del_cq_work_handler);
 }
 
-static void nvme_del_sq_work_handler(struct kthread_work *work)
+static int nvme_dev_list_add(struct nvme_dev *dev)
 {
-       struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
-                                                       cmdinfo.work);
-       int status = nvmeq->cmdinfo.status;
-
-       if (!status)
-               status = nvme_delete_cq(nvmeq);
-       if (status)
-               nvme_del_queue_end(nvmeq);
-}
-
-static int nvme_delete_sq(struct nvme_queue *nvmeq)
-{
-       return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
-                                               nvme_del_sq_work_handler);
-}
-
-static void nvme_del_queue_start(struct kthread_work *work)
-{
-       struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
-                                                       cmdinfo.work);
-       if (nvme_delete_sq(nvmeq))
-               nvme_del_queue_end(nvmeq);
-}
+       bool start_thread = false;
 
-static void nvme_disable_io_queues(struct nvme_dev *dev)
-{
-       int i;
-       DEFINE_KTHREAD_WORKER_ONSTACK(worker);
-       struct nvme_delq_ctx dq;
-       struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
-                                       &worker, "nvme%d", dev->instance);
-
-       if (IS_ERR(kworker_task)) {
-               dev_err(dev->dev,
-                       "Failed to create queue del task\n");
-               for (i = dev->queue_count - 1; i > 0; i--)
-                       nvme_disable_queue(dev, i);
-               return;
+       spin_lock(&dev_list_lock);
+       if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
+               start_thread = true;
+               nvme_thread = NULL;
        }
+       list_add(&dev->node, &dev_list);
+       spin_unlock(&dev_list_lock);
 
-       dq.waiter = NULL;
-       atomic_set(&dq.refcount, 0);
-       dq.worker = &worker;
-       for (i = dev->queue_count - 1; i > 0; i--) {
-               struct nvme_queue *nvmeq = dev->queues[i];
+       if (start_thread) {
+               nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
+               wake_up_all(&nvme_kthread_wait);
+       } else
+               wait_event_killable(nvme_kthread_wait, nvme_thread);
 
-               if (nvme_suspend_queue(nvmeq))
-                       continue;
-               nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
-               nvmeq->cmdinfo.worker = dq.worker;
-               init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
-               queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
-       }
-       nvme_wait_dq(&dq, dev);
-       kthread_stop(kworker_task);
+       if (IS_ERR_OR_NULL(nvme_thread))
+               return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
+
+       return 0;
 }
 
 /*
@@ -2928,44 +1829,17 @@ static void nvme_dev_list_remove(struct nvme_dev *dev)
                kthread_stop(tmp);
 }
 
-static void nvme_freeze_queues(struct nvme_dev *dev)
-{
-       struct nvme_ns *ns;
-
-       list_for_each_entry(ns, &dev->namespaces, list) {
-               blk_mq_freeze_queue_start(ns->queue);
-
-               spin_lock_irq(ns->queue->queue_lock);
-               queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
-               spin_unlock_irq(ns->queue->queue_lock);
-
-               blk_mq_cancel_requeue_work(ns->queue);
-               blk_mq_stop_hw_queues(ns->queue);
-       }
-}
-
-static void nvme_unfreeze_queues(struct nvme_dev *dev)
-{
-       struct nvme_ns *ns;
-
-       list_for_each_entry(ns, &dev->namespaces, list) {
-               queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
-               blk_mq_unfreeze_queue(ns->queue);
-               blk_mq_start_stopped_hw_queues(ns->queue, true);
-               blk_mq_kick_requeue_list(ns->queue);
-       }
-}
-
-static void nvme_dev_shutdown(struct nvme_dev *dev)
+static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
        int i;
        u32 csts = -1;
 
        nvme_dev_list_remove(dev);
 
+       mutex_lock(&dev->shutdown_lock);
        if (dev->bar) {
-               nvme_freeze_queues(dev);
-               csts = readl(&dev->bar->csts);
+               nvme_stop_queues(&dev->ctrl);
+               csts = readl(dev->bar + NVME_REG_CSTS);
        }
        if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
                for (i = dev->queue_count - 1; i >= 0; i--) {
@@ -2974,30 +1848,13 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
                }
        } else {
                nvme_disable_io_queues(dev);
-               nvme_shutdown_ctrl(dev);
-               nvme_disable_queue(dev, 0);
+               nvme_disable_admin_queue(dev, shutdown);
        }
        nvme_dev_unmap(dev);
 
        for (i = dev->queue_count - 1; i >= 0; i--)
                nvme_clear_queue(dev->queues[i]);
-}
-
-static void nvme_dev_remove(struct nvme_dev *dev)
-{
-       struct nvme_ns *ns, *next;
-
-       if (nvme_io_incapable(dev)) {
-               /*
-                * If the device is not capable of IO (surprise hot-removal,
-                * for example), we need to quiesce prior to deleting the
-                * namespaces. This will end outstanding requests and prevent
-                * attempts to sync dirty data.
-                */
-               nvme_dev_shutdown(dev);
-       }
-       list_for_each_entry_safe(ns, next, &dev->namespaces, list)
-               nvme_ns_remove(ns);
+       mutex_unlock(&dev->shutdown_lock);
 }
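
The new shutdown flag selects between the two teardown flavors the spec distinguishes: a true shutdown (system power-off or suspend) should notify the controller via CC.SHN so it can flush volatile caches, while a reset-path disable just clears the enable bit because all state is about to be rebuilt anyway. Usage as it appears elsewhere in this patch:

	nvme_dev_disable(dev, true);	/* nvme_shutdown()/nvme_suspend(): orderly CC.SHN shutdown */
	nvme_dev_disable(dev, false);	/* reset and error paths: bare disable, state is rebuilt */
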
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -3023,119 +1880,36 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
        dma_pool_destroy(dev->prp_small_pool);
 }
 
-static DEFINE_IDA(nvme_instance_ida);
-
-static int nvme_set_instance(struct nvme_dev *dev)
-{
-       int instance, error;
-
-       do {
-               if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
-                       return -ENODEV;
-
-               spin_lock(&dev_list_lock);
-               error = ida_get_new(&nvme_instance_ida, &instance);
-               spin_unlock(&dev_list_lock);
-       } while (error == -EAGAIN);
-
-       if (error)
-               return -ENODEV;
-
-       dev->instance = instance;
-       return 0;
-}
-
-static void nvme_release_instance(struct nvme_dev *dev)
-{
-       spin_lock(&dev_list_lock);
-       ida_remove(&nvme_instance_ida, dev->instance);
-       spin_unlock(&dev_list_lock);
-}
-
-static void nvme_free_dev(struct kref *kref)
+static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 {
-       struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
+       struct nvme_dev *dev = to_nvme_dev(ctrl);
 
        put_device(dev->dev);
-       put_device(dev->device);
-       nvme_release_instance(dev);
        if (dev->tagset.tags)
                blk_mq_free_tag_set(&dev->tagset);
-       if (dev->admin_q)
-               blk_put_queue(dev->admin_q);
+       if (dev->ctrl.admin_q)
+               blk_put_queue(dev->ctrl.admin_q);
        kfree(dev->queues);
        kfree(dev->entry);
        kfree(dev);
 }
 
-static int nvme_dev_open(struct inode *inode, struct file *f)
-{
-       struct nvme_dev *dev;
-       int instance = iminor(inode);
-       int ret = -ENODEV;
-
-       spin_lock(&dev_list_lock);
-       list_for_each_entry(dev, &dev_list, node) {
-               if (dev->instance == instance) {
-                       if (!dev->admin_q) {
-                               ret = -EWOULDBLOCK;
-                               break;
-                       }
-                       if (!kref_get_unless_zero(&dev->kref))
-                               break;
-                       f->private_data = dev;
-                       ret = 0;
-                       break;
-               }
-       }
-       spin_unlock(&dev_list_lock);
-
-       return ret;
-}
-
-static int nvme_dev_release(struct inode *inode, struct file *f)
+static void nvme_reset_work(struct work_struct *work)
 {
-       struct nvme_dev *dev = f->private_data;
-       kref_put(&dev->kref, nvme_free_dev);
-       return 0;
-}
+       struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
+       int result;
 
-static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-{
-       struct nvme_dev *dev = f->private_data;
-       struct nvme_ns *ns;
-
-       switch (cmd) {
-       case NVME_IOCTL_ADMIN_CMD:
-               return nvme_user_cmd(dev, NULL, (void __user *)arg);
-       case NVME_IOCTL_IO_CMD:
-               if (list_empty(&dev->namespaces))
-                       return -ENOTTY;
-               ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
-               return nvme_user_cmd(dev, ns, (void __user *)arg);
-       case NVME_IOCTL_RESET:
-               dev_warn(dev->dev, "resetting controller\n");
-               return nvme_reset(dev);
-       case NVME_IOCTL_SUBSYS_RESET:
-               return nvme_subsys_reset(dev);
-       default:
-               return -ENOTTY;
-       }
-}
+       if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
+               goto out;
 
-static const struct file_operations nvme_dev_fops = {
-       .owner          = THIS_MODULE,
-       .open           = nvme_dev_open,
-       .release        = nvme_dev_release,
-       .unlocked_ioctl = nvme_dev_ioctl,
-       .compat_ioctl   = nvme_dev_ioctl,
-};
+       /*
+        * If we're called to reset a live controller, first shut it down before
+        * moving on.
+        */
+       if (dev->bar)
+               nvme_dev_disable(dev, false);
 
-static void nvme_probe_work(struct work_struct *work)
-{
-       struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
-       bool start_thread = false;
-       int result;
+       set_bit(NVME_CTRL_RESETTING, &dev->flags);
 
        result = nvme_dev_map(dev);
        if (result)
@@ -3145,35 +1919,24 @@ static void nvme_probe_work(struct work_struct *work)
        if (result)
                goto unmap;
 
-       spin_lock(&dev_list_lock);
-       if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
-               start_thread = true;
-               nvme_thread = NULL;
-       }
-       list_add(&dev->node, &dev_list);
-       spin_unlock(&dev_list_lock);
-
-       if (start_thread) {
-               nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
-               wake_up_all(&nvme_kthread_wait);
-       } else
-               wait_event_killable(nvme_kthread_wait, nvme_thread);
-
-       if (IS_ERR_OR_NULL(nvme_thread)) {
-               result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
-               goto disable;
-       }
-
        nvme_init_queue(dev->queues[0], 0);
        result = nvme_alloc_admin_tags(dev);
        if (result)
                goto disable;
 
+       result = nvme_init_identify(&dev->ctrl);
+       if (result)
+               goto free_tags;
+
        result = nvme_setup_io_queues(dev);
        if (result)
                goto free_tags;
 
-       dev->event_limit = 1;
+       dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
+
+       result = nvme_dev_list_add(dev);
+       if (result)
+               goto remove;
 
        /*
         * Keep the controller around but remove all namespaces if we don't have
@@ -3181,117 +1944,98 @@ static void nvme_probe_work(struct work_struct *work)
         */
        if (dev->online_queues < 2) {
                dev_warn(dev->dev, "IO queues not created\n");
-               nvme_dev_remove(dev);
+               nvme_remove_namespaces(&dev->ctrl);
        } else {
-               nvme_unfreeze_queues(dev);
+               nvme_start_queues(&dev->ctrl);
                nvme_dev_add(dev);
        }
 
+       clear_bit(NVME_CTRL_RESETTING, &dev->flags);
        return;
 
+ remove:
+       nvme_dev_list_remove(dev);
  free_tags:
        nvme_dev_remove_admin(dev);
-       blk_put_queue(dev->admin_q);
-       dev->admin_q = NULL;
+       blk_put_queue(dev->ctrl.admin_q);
+       dev->ctrl.admin_q = NULL;
        dev->queues[0]->tags = NULL;
  disable:
-       nvme_disable_queue(dev, 0);
-       nvme_dev_list_remove(dev);
+       nvme_disable_admin_queue(dev, false);
  unmap:
        nvme_dev_unmap(dev);
  out:
-       if (!work_busy(&dev->reset_work))
-               nvme_dead_ctrl(dev);
+       nvme_remove_dead_ctrl(dev);
 }
 
-static int nvme_remove_dead_ctrl(void *arg)
+static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 {
-       struct nvme_dev *dev = (struct nvme_dev *)arg;
+       struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
        struct pci_dev *pdev = to_pci_dev(dev->dev);
 
        if (pci_get_drvdata(pdev))
                pci_stop_and_remove_bus_device_locked(pdev);
-       kref_put(&dev->kref, nvme_free_dev);
-       return 0;
+       nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_dead_ctrl(struct nvme_dev *dev)
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
 {
-       dev_warn(dev->dev, "Device failed to resume\n");
-       kref_get(&dev->kref);
-       if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
-                                               dev->instance))) {
-               dev_err(dev->dev,
-                       "Failed to start controller remove task\n");
-               kref_put(&dev->kref, nvme_free_dev);
-       }
+       dev_warn(dev->dev, "Removing after probe failure\n");
+       kref_get(&dev->ctrl.kref);
+       if (!schedule_work(&dev->remove_work))
+               nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_reset_work(struct work_struct *ws)
+static int nvme_reset(struct nvme_dev *dev)
 {
-       struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
-       bool in_probe = work_busy(&dev->probe_work);
-
-       nvme_dev_shutdown(dev);
+       if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
+               return -ENODEV;
 
-       /* Synchronize with device probe so that work will see failure status
-        * and exit gracefully without trying to schedule another reset */
-       flush_work(&dev->probe_work);
+       if (!queue_work(nvme_workq, &dev->reset_work))
+               return -EBUSY;
 
-       /* Fail this device if reset occured during probe to avoid
-        * infinite initialization loops. */
-       if (in_probe) {
-               nvme_dead_ctrl(dev);
-               return;
-       }
-       /* Schedule device resume asynchronously so the reset work is available
-        * to cleanup errors that may occur during reinitialization */
-       schedule_work(&dev->probe_work);
+       flush_work(&dev->reset_work);
+       return 0;
 }
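
Note that queue_work() returns false when the work item is already pending, so concurrent reset requests naturally collapse into a single run of nvme_reset_work(); the -EBUSY above is the "already resetting" answer, and synchronous callers such as this one simply flush_work() afterwards. A fire-and-forget caller would look like this sketch:

	if (!queue_work(nvme_workq, &dev->reset_work))
		dev_dbg(dev->dev, "reset already in progress\n");
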
 
-static int __nvme_reset(struct nvme_dev *dev)
+static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 {
-       if (work_pending(&dev->reset_work))
-               return -EBUSY;
-       list_del_init(&dev->node);
-       queue_work(nvme_workq, &dev->reset_work);
+       *val = readl(to_nvme_dev(ctrl)->bar + off);
        return 0;
 }
 
-static int nvme_reset(struct nvme_dev *dev)
+static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
 {
-       int ret;
-
-       if (!dev->admin_q || blk_queue_dying(dev->admin_q))
-               return -ENODEV;
-
-       spin_lock(&dev_list_lock);
-       ret = __nvme_reset(dev);
-       spin_unlock(&dev_list_lock);
-
-       if (!ret) {
-               flush_work(&dev->reset_work);
-               flush_work(&dev->probe_work);
-               return 0;
-       }
+       writel(val, to_nvme_dev(ctrl)->bar + off);
+       return 0;
+}
 
-       return ret;
+static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
+{
+       *val = readq(to_nvme_dev(ctrl)->bar + off);
+       return 0;
 }
 
-static ssize_t nvme_sysfs_reset(struct device *dev,
-                               struct device_attribute *attr, const char *buf,
-                               size_t count)
+static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl)
 {
-       struct nvme_dev *ndev = dev_get_drvdata(dev);
-       int ret;
+       struct nvme_dev *dev = to_nvme_dev(ctrl);
 
-       ret = nvme_reset(ndev);
-       if (ret < 0)
-               return ret;
+       return !dev->bar || dev->online_queues < 2;
+}
 
-       return count;
+static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
+{
+       return nvme_reset(to_nvme_dev(ctrl));
 }
-static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
+
+static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
+       .reg_read32             = nvme_pci_reg_read32,
+       .reg_write32            = nvme_pci_reg_write32,
+       .reg_read64             = nvme_pci_reg_read64,
+       .io_incapable           = nvme_pci_io_incapable,
+       .reset_ctrl             = nvme_pci_reset_ctrl,
+       .free_ctrl              = nvme_pci_free_ctrl,
+};
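
This ops table is the seam between the transport-neutral core and the PCI transport: core helpers that this patch now calls with &dev->ctrl (nvme_disable_ctrl, nvme_enable_ctrl, nvme_init_identify, ...) reach the registers only through these callbacks. A sketch of how core code consumes it (the wrapper name is invented for illustration):

static int nvme_ctrl_read_csts(struct nvme_ctrl *ctrl, u32 *csts)
{
	/* Dispatch through the registered transport instead of readl() on a BAR. */
	return ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, csts);
}
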
 
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
@@ -3314,46 +2058,30 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (!dev->queues)
                goto free;
 
-       INIT_LIST_HEAD(&dev->namespaces);
-       INIT_WORK(&dev->reset_work, nvme_reset_work);
        dev->dev = get_device(&pdev->dev);
        pci_set_drvdata(pdev, dev);
-       result = nvme_set_instance(dev);
-       if (result)
-               goto put_pci;
+
+       INIT_LIST_HEAD(&dev->node);
+       INIT_WORK(&dev->scan_work, nvme_dev_scan);
+       INIT_WORK(&dev->reset_work, nvme_reset_work);
+       INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+       mutex_init(&dev->shutdown_lock);
+       init_completion(&dev->ioq_wait);
 
        result = nvme_setup_prp_pools(dev);
        if (result)
-               goto release;
-
-       kref_init(&dev->kref);
-       dev->device = device_create(nvme_class, &pdev->dev,
-                               MKDEV(nvme_char_major, dev->instance),
-                               dev, "nvme%d", dev->instance);
-       if (IS_ERR(dev->device)) {
-               result = PTR_ERR(dev->device);
-               goto release_pools;
-       }
-       get_device(dev->device);
-       dev_set_drvdata(dev->device, dev);
+               goto put_pci;
 
-       result = device_create_file(dev->device, &dev_attr_reset_controller);
+       result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
+                       id->driver_data);
        if (result)
-               goto put_dev;
+               goto release_pools;
 
-       INIT_LIST_HEAD(&dev->node);
-       INIT_WORK(&dev->scan_work, nvme_dev_scan);
-       INIT_WORK(&dev->probe_work, nvme_probe_work);
-       schedule_work(&dev->probe_work);
+       queue_work(nvme_workq, &dev->reset_work);
        return 0;
 
- put_dev:
-       device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
-       put_device(dev->device);
  release_pools:
        nvme_release_prp_pools(dev);
- release:
-       nvme_release_instance(dev);
  put_pci:
        put_device(dev->dev);
  free:
@@ -3368,15 +2096,15 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
        struct nvme_dev *dev = pci_get_drvdata(pdev);
 
        if (prepare)
-               nvme_dev_shutdown(dev);
+               nvme_dev_disable(dev, false);
        else
-               schedule_work(&dev->probe_work);
+               queue_work(nvme_workq, &dev->reset_work);
 }
 
 static void nvme_shutdown(struct pci_dev *pdev)
 {
        struct nvme_dev *dev = pci_get_drvdata(pdev);
-       nvme_dev_shutdown(dev);
+       nvme_dev_disable(dev, true);
 }
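
Judging from the call sites in this patch, the new second argument to
nvme_dev_disable() selects how far the teardown goes; a sketch of the
assumed convention (the wrapper name is illustrative, and the comments are
inferred from usage rather than quoted from the driver):

static void nvme_disable_examples(struct nvme_dev *dev)
{
	/* Reset path: quiesce I/O only; the controller comes right back. */
	nvme_dev_disable(dev, false);

	/* Shutdown/removal path: the full, slower shutdown handshake, so
	 * outstanding data is flushed before the device goes away. */
	nvme_dev_disable(dev, true);
}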
 
 static void nvme_remove(struct pci_dev *pdev)
@@ -3388,34 +2116,25 @@ static void nvme_remove(struct pci_dev *pdev)
        spin_unlock(&dev_list_lock);
 
        pci_set_drvdata(pdev, NULL);
-       flush_work(&dev->probe_work);
        flush_work(&dev->reset_work);
        flush_work(&dev->scan_work);
-       device_remove_file(dev->device, &dev_attr_reset_controller);
-       nvme_dev_remove(dev);
-       nvme_dev_shutdown(dev);
+       nvme_remove_namespaces(&dev->ctrl);
+       nvme_uninit_ctrl(&dev->ctrl);
+       nvme_dev_disable(dev, true);
        nvme_dev_remove_admin(dev);
-       device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
        nvme_free_queues(dev, 0);
        nvme_release_cmb(dev);
        nvme_release_prp_pools(dev);
-       kref_put(&dev->kref, nvme_free_dev);
+       nvme_put_ctrl(&dev->ctrl);
 }
 
-/* These functions are yet to be implemented */
-#define nvme_error_detected NULL
-#define nvme_dump_registers NULL
-#define nvme_link_reset NULL
-#define nvme_slot_reset NULL
-#define nvme_error_resume NULL
-
 #ifdef CONFIG_PM_SLEEP
 static int nvme_suspend(struct device *dev)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);
 
-       nvme_dev_shutdown(ndev);
+       nvme_dev_disable(ndev, true);
        return 0;
 }
 
@@ -3424,17 +2143,53 @@ static int nvme_resume(struct device *dev)
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);
 
-       schedule_work(&ndev->probe_work);
+       queue_work(nvme_workq, &ndev->reset_work);
        return 0;
 }
 #endif
 
 static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
 
+static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
+                                               pci_channel_state_t state)
+{
+       struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+       /*
+        * A frozen channel requires a reset. When detected, this method
+        * will shut down the controller to quiesce it. The controller will
+        * be restarted after the slot reset through the driver's
+        * slot_reset callback.
+        */
+       dev_warn(&pdev->dev, "error detected: state:%d\n", state);
+       switch (state) {
+       case pci_channel_io_normal:
+               return PCI_ERS_RESULT_CAN_RECOVER;
+       case pci_channel_io_frozen:
+               nvme_dev_disable(dev, false);
+               return PCI_ERS_RESULT_NEED_RESET;
+       case pci_channel_io_perm_failure:
+               return PCI_ERS_RESULT_DISCONNECT;
+       }
+       return PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
+{
+       struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+       dev_info(&pdev->dev, "restart after slot reset\n");
+       pci_restore_state(pdev);
+       queue_work(nvme_workq, &dev->reset_work);
+       return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void nvme_error_resume(struct pci_dev *pdev)
+{
+       pci_cleanup_aer_uncorrect_error_status(pdev);
+}
+
 static const struct pci_error_handlers nvme_err_handler = {
        .error_detected = nvme_error_detected,
-       .mmio_enabled   = nvme_dump_registers,
-       .link_reset     = nvme_link_reset,
        .slot_reset     = nvme_slot_reset,
        .resume         = nvme_error_resume,
        .reset_notify   = nvme_reset_notify,
@@ -3444,6 +2199,10 @@ static const struct pci_error_handlers nvme_err_handler = {
 #define PCI_CLASS_STORAGE_EXPRESS      0x010802
 
 static const struct pci_device_id nvme_id_table[] = {
+       { PCI_VDEVICE(INTEL, 0x0953),
+               .driver_data = NVME_QUIRK_STRIPE_SIZE, },
+       { PCI_VDEVICE(INTEL, 0x5845),   /* Qemu emulated controller */
+               .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
        { 0, }
@@ -3468,40 +2227,21 @@ static int __init nvme_init(void)
 
        init_waitqueue_head(&nvme_kthread_wait);
 
-       nvme_workq = create_singlethread_workqueue("nvme");
+       nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
        if (!nvme_workq)
                return -ENOMEM;
 
-       result = register_blkdev(nvme_major, "nvme");
+       result = nvme_core_init();
        if (result < 0)
                goto kill_workq;
-       else if (result > 0)
-               nvme_major = result;
-
-       result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
-                                                       &nvme_dev_fops);
-       if (result < 0)
-               goto unregister_blkdev;
-       else if (result > 0)
-               nvme_char_major = result;
-
-       nvme_class = class_create(THIS_MODULE, "nvme");
-       if (IS_ERR(nvme_class)) {
-               result = PTR_ERR(nvme_class);
-               goto unregister_chrdev;
-       }
 
        result = pci_register_driver(&nvme_driver);
        if (result)
-               goto destroy_class;
+               goto core_exit;
        return 0;
 
- destroy_class:
-       class_destroy(nvme_class);
- unregister_chrdev:
-       __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
- unregister_blkdev:
-       unregister_blkdev(nvme_major, "nvme");
+ core_exit:
+       nvme_core_exit();
  kill_workq:
        destroy_workqueue(nvme_workq);
        return result;
@@ -3510,10 +2250,8 @@ static int __init nvme_init(void)
 static void __exit nvme_exit(void)
 {
        pci_unregister_driver(&nvme_driver);
-       unregister_blkdev(nvme_major, "nvme");
+       nvme_core_exit();
        destroy_workqueue(nvme_workq);
-       class_destroy(nvme_class);
-       __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
        BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
        _nvme_check_size();
 }
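
The workqueue change above is not cosmetic: reset and scan work may have to
run while the system is reclaiming memory, and WQ_MEM_RECLAIM provides a
rescuer thread that guarantees forward progress there, while WQ_UNBOUND stops
tying the long-running work items to the submitting CPU. A sketch of the
allocation pattern, assuming the standard alloc_workqueue() signature (a
printf-style name, flags, then max_active, where 0 means the default):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* stands in for nvme_workq */

static int example_init(void)
{
	example_wq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	return example_wq ? 0 : -ENOMEM;
}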
index c3d8d3887a313bc8ee914292a83d6c6dcbf41afa..e947e298a737b17a5267071a9102dbc7da72f6cb 100644
@@ -524,7 +524,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
                                        struct sg_io_hdr *hdr, u8 *inq_response,
                                        int alloc_len)
 {
-       struct nvme_dev *dev = ns->dev;
+       struct nvme_ctrl *ctrl = ns->ctrl;
        struct nvme_id_ns *id_ns;
        int res;
        int nvme_sc;
@@ -532,10 +532,10 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
        u8 resp_data_format = 0x02;
        u8 protect;
        u8 cmdque = 0x01 << 1;
-       u8 fw_offset = sizeof(dev->firmware_rev);
+       u8 fw_offset = sizeof(ctrl->firmware_rev);
 
        /* nvme ns identify - use DPS value for PROTECT field */
-       nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
+       nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
                return res;
@@ -553,12 +553,12 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
        inq_response[5] = protect;      /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */
        inq_response[7] = cmdque;       /* wbus16=0 | sync=0 | vs=0 */
        strncpy(&inq_response[8], "NVMe    ", 8);
-       strncpy(&inq_response[16], dev->model, 16);
+       strncpy(&inq_response[16], ctrl->model, 16);
 
-       while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4)
+       while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4)
                fw_offset--;
        fw_offset -= 4;
-       strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4);
+       strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4);
 
        xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
        return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
@@ -588,82 +588,113 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
                                        struct sg_io_hdr *hdr, u8 *inq_response,
                                        int alloc_len)
 {
-       struct nvme_dev *dev = ns->dev;
        int xfer_len;
 
        memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
        inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */
        inq_response[3] = INQ_SERIAL_NUMBER_LENGTH;    /* Page Length */
-       strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH);
+       strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH);
 
        xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
        return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 }
 
-static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
-                                       u8 *inq_response, int alloc_len)
+static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+               u8 *inq_response, int alloc_len)
 {
-       struct nvme_dev *dev = ns->dev;
-       int res;
-       int nvme_sc;
-       int xfer_len;
-       __be32 tmp_id = cpu_to_be32(ns->ns_id);
+       struct nvme_id_ns *id_ns;
+       int nvme_sc, res;
+       size_t len;
+       void *eui;
 
-       memset(inq_response, 0, alloc_len);
-       inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;    /* Page Code */
-       if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) {
-               struct nvme_id_ns *id_ns;
-               void *eui;
-               int len;
+       nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
+       res = nvme_trans_status_code(hdr, nvme_sc);
+       if (res)
+               return res;
 
-               nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
-               res = nvme_trans_status_code(hdr, nvme_sc);
-               if (res)
-                       return res;
+       eui = id_ns->eui64;
+       len = sizeof(id_ns->eui64);
 
-               eui = id_ns->eui64;
-               len = sizeof(id_ns->eui64);
-               if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) {
-                       if (bitmap_empty(eui, len * 8)) {
-                               eui = id_ns->nguid;
-                               len = sizeof(id_ns->nguid);
-                       }
-               }
+       if (ns->ctrl->vs >= NVME_VS(1, 2)) {
                if (bitmap_empty(eui, len * 8)) {
-                       kfree(id_ns);
-                       goto scsi_string;
+                       eui = id_ns->nguid;
+                       len = sizeof(id_ns->nguid);
                }
+       }
 
-               inq_response[3] = 4 + len; /* Page Length */
-               /* Designation Descriptor start */
-               inq_response[4] = 0x01;    /* Proto ID=0h | Code set=1h */
-               inq_response[5] = 0x02;    /* PIV=0b | Asso=00b | Designator Type=2h */
-               inq_response[6] = 0x00;    /* Rsvd */
-               inq_response[7] = len;     /* Designator Length */
-               memcpy(&inq_response[8], eui, len);
-               kfree(id_ns);
-       } else {
- scsi_string:
-               if (alloc_len < 72) {
-                       return nvme_trans_completion(hdr,
-                                       SAM_STAT_CHECK_CONDITION,
-                                       ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
-                                       SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-               }
-               inq_response[3] = 0x48;    /* Page Length */
-               /* Designation Descriptor start */
-               inq_response[4] = 0x03;    /* Proto ID=0h | Code set=3h */
-               inq_response[5] = 0x08;    /* PIV=0b | Asso=00b | Designator Type=8h */
-               inq_response[6] = 0x00;    /* Rsvd */
-               inq_response[7] = 0x44;    /* Designator Length */
-
-               sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor);
-               memcpy(&inq_response[12], dev->model, sizeof(dev->model));
-               sprintf(&inq_response[52], "%04x", tmp_id);
-               memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
+       if (bitmap_empty(eui, len * 8)) {
+               res = -EOPNOTSUPP;
+               goto out_free_id;
        }
-       xfer_len = alloc_len;
-       return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+       memset(inq_response, 0, alloc_len);
+       inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;
+       inq_response[3] = 4 + len; /* Page Length */
+
+       /* Designation Descriptor start */
+       inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */
+       inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */
+       inq_response[6] = 0x00; /* Rsvd */
+       inq_response[7] = len;  /* Designator Length */
+       memcpy(&inq_response[8], eui, len);
+
+       res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len);
+out_free_id:
+       kfree(id_ns);
+       return res;
+}
+
+static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns,
+               struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len)
+{
+       struct nvme_ctrl *ctrl = ns->ctrl;
+       struct nvme_id_ctrl *id_ctrl;
+       int nvme_sc, res;
+
+       if (alloc_len < 72) {
+               return nvme_trans_completion(hdr,
+                               SAM_STAT_CHECK_CONDITION,
+                               ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+                               SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+       }
+
+       nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl);
+       res = nvme_trans_status_code(hdr, nvme_sc);
+       if (res)
+               return res;
+
+       memset(inq_response, 0, alloc_len);
+       inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;
+       inq_response[3] = 0x48; /* Page Length */
+
+       /* Designation Descriptor start */
+       inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */
+       inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */
+       inq_response[6] = 0x00; /* Rsvd */
+       inq_response[7] = 0x44; /* Designator Length */
+
+       sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid));
+       memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model));
+       sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id));
+       memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial));
+
+       res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len);
+       kfree(id_ctrl);
+       return res;
+}
+
+static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+                                       u8 *resp, int alloc_len)
+{
+       int res;
+
+       if (ns->ctrl->vs >= NVME_VS(1, 1)) {
+               res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len);
+               if (res != -EOPNOTSUPP)
+                       return res;
+       }
+
+       return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len);
 }
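
The bare integer comparisons against ctrl->vs work because NVME_VS() packs
the Version register's major and minor fields into a single value; assuming
the era's definition from <linux/nvme.h>:

#define NVME_VS(major, minor)	(((major) << 16) | ((minor) << 8))

/* A 1.2 controller reports vs == 0x00010200, so vs >= NVME_VS(1, 1)
 * (0x00010100) holds and the EUI-64/NGUID path is tried first, falling
 * back to the SCSI-string designator when it returns -EOPNOTSUPP. */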
 
 static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
@@ -672,7 +703,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        u8 *inq_response;
        int res;
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
+       struct nvme_ctrl *ctrl = ns->ctrl;
        struct nvme_id_ctrl *id_ctrl;
        struct nvme_id_ns *id_ns;
        int xfer_len;
@@ -688,7 +719,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        if (inq_response == NULL)
                return -ENOMEM;
 
-       nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
+       nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
                goto out_free_inq;
@@ -704,7 +735,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        app_chk = protect << 1;
        ref_chk = protect;
 
-       nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
+       nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
                goto out_free_inq;
@@ -815,7 +846,6 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
        int res;
        int xfer_len;
        u8 *log_response;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_smart_log *smart_log;
        u8 temp_c;
        u16 temp_k;
@@ -824,7 +854,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
        if (log_response == NULL)
                return -ENOMEM;
 
-       res = nvme_get_log_page(dev, &smart_log);
+       res = nvme_get_log_page(ns->ctrl, &smart_log);
        if (res < 0)
                goto out_free_response;
 
@@ -862,7 +892,6 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        int res;
        int xfer_len;
        u8 *log_response;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_smart_log *smart_log;
        u32 feature_resp;
        u8 temp_c_cur, temp_c_thresh;
@@ -872,7 +901,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        if (log_response == NULL)
                return -ENOMEM;
 
-       res = nvme_get_log_page(dev, &smart_log);
+       res = nvme_get_log_page(ns->ctrl, &smart_log);
        if (res < 0)
                goto out_free_response;
 
@@ -886,7 +915,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        kfree(smart_log);
 
        /* Get Features for Temp Threshold */
-       res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0,
+       res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0,
                                                                &feature_resp);
        if (res != NVME_SC_SUCCESS)
                temp_c_thresh = LOG_TEMP_UNKNOWN;
@@ -948,7 +977,6 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 {
        int res;
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_id_ns *id_ns;
        u8 flbas;
        u32 lba_length;
@@ -958,7 +986,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN)
                return -EINVAL;
 
-       nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
+       nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
                return res;
@@ -1014,14 +1042,13 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns,
 {
        int res = 0;
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
        u32 feature_resp;
        u8 vwc;
 
        if (len < MODE_PAGE_CACHING_LEN)
                return -EINVAL;
 
-       nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0,
+       nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0,
                                                                &feature_resp);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
@@ -1207,12 +1234,11 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 {
        int res;
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_id_ctrl *id_ctrl;
        int lowest_pow_st;      /* max npss = lowest power consumption */
        unsigned ps_desired = 0;
 
-       nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
+       nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
                return res;
@@ -1256,7 +1282,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
                                SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
                break;
        }
-       nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0,
+       nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0,
                                    NULL);
        return nvme_trans_status_code(hdr, nvme_sc);
 }
@@ -1280,7 +1306,6 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr
                                        u8 buffer_id)
 {
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_command c;
 
        if (hdr->iovec_count > 0) {
@@ -1297,7 +1322,7 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr
        c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1);
        c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS);
 
-       nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL,
+       nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c,
                        hdr->dxferp, tot_len, NULL, 0);
        return nvme_trans_status_code(hdr, nvme_sc);
 }
@@ -1364,14 +1389,13 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 {
        int res = 0;
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
        unsigned dword11;
 
        switch (page_code) {
        case MODE_PAGE_CACHING:
                dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0);
-               nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11,
-                                           0, NULL);
+               nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC,
+                                           dword11, 0, NULL);
                res = nvme_trans_status_code(hdr, nvme_sc);
                break;
        case MODE_PAGE_CONTROL:
@@ -1473,7 +1497,6 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 {
        int res = 0;
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
        u8 flbas;
 
        /*
@@ -1486,7 +1509,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
        if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) {
                struct nvme_id_ns *id_ns;
 
-               nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
+               nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
                res = nvme_trans_status_code(hdr, nvme_sc);
                if (res)
                        return res;
@@ -1570,7 +1593,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 {
        int res;
        int nvme_sc;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_id_ns *id_ns;
        u8 i;
        u8 flbas, nlbaf;
@@ -1579,7 +1601,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        struct nvme_command c;
 
        /* Loop through the LBAFs in id_ns to match the requested LBAF; put it in cdw10 */
-       nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
+       nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
                return res;
@@ -1611,7 +1633,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        c.format.nsid = cpu_to_le32(ns->ns_id);
        c.format.cdw10 = cpu_to_le32(cdw10);
 
-       nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
+       nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0);
        res = nvme_trans_status_code(hdr, nvme_sc);
 
        kfree(id_ns);
@@ -1704,7 +1726,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
                        nvme_sc = NVME_SC_LBA_RANGE;
                        break;
                }
-               nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
+               nvme_sc = nvme_submit_user_cmd(ns->queue, &c,
                                next_mapping_addr, unit_len, NULL, 0);
                if (nvme_sc)
                        break;
@@ -2040,7 +2062,6 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        u32 alloc_len;
        u32 resp_size;
        u32 xfer_len;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_id_ns *id_ns;
        u8 *response;
 
@@ -2052,7 +2073,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
                resp_size = READ_CAP_10_RESP_SIZE;
        }
 
-       nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
+       nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns);
        res = nvme_trans_status_code(hdr, nvme_sc);
        if (res)
                return res;     
@@ -2080,7 +2101,6 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        int nvme_sc;
        u32 alloc_len, xfer_len, resp_size;
        u8 *response;
-       struct nvme_dev *dev = ns->dev;
        struct nvme_id_ctrl *id_ctrl;
        u32 ll_length, lun_id;
        u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET;
@@ -2094,7 +2114,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
        case ALL_LUNS_RETURNED:
        case ALL_WELL_KNOWN_LUNS_RETURNED:
        case RESTRICTED_LUNS_RETURNED:
-               nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
+               nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl);
                res = nvme_trans_status_code(hdr, nvme_sc);
                if (res)
                        return res;
@@ -2295,9 +2315,7 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns,
                                        struct sg_io_hdr *hdr,
                                        u8 *cmd)
 {
-       struct nvme_dev *dev = ns->dev;
-
-       if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY))
+       if (!nvme_ctrl_ready(ns->ctrl))
                return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
                                            NOT_READY, SCSI_ASC_LUN_NOT_READY,
                                            SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
index dd92c5edf219453aac0bb98d161fc1136395f1c7..b48ac6300c792d4336487f841a2267a5b883c6f5 100644
@@ -138,22 +138,22 @@ static int __oprofilefs_create_file(struct dentry *root, char const *name,
        struct dentry *dentry;
        struct inode *inode;
 
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
        dentry = d_alloc_name(root, name);
        if (!dentry) {
-               mutex_unlock(&d_inode(root)->i_mutex);
+               inode_unlock(d_inode(root));
                return -ENOMEM;
        }
        inode = oprofilefs_get_inode(root->d_sb, S_IFREG | perm);
        if (!inode) {
                dput(dentry);
-               mutex_unlock(&d_inode(root)->i_mutex);
+               inode_unlock(d_inode(root));
                return -ENOMEM;
        }
        inode->i_fop = fops;
        inode->i_private = priv;
        d_add(dentry, inode);
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
        return 0;
 }
 
@@ -215,22 +215,22 @@ struct dentry *oprofilefs_mkdir(struct dentry *parent, char const *name)
        struct dentry *dentry;
        struct inode *inode;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        dentry = d_alloc_name(parent, name);
        if (!dentry) {
-               mutex_unlock(&d_inode(parent)->i_mutex);
+               inode_unlock(d_inode(parent));
                return NULL;
        }
        inode = oprofilefs_get_inode(parent->d_sb, S_IFDIR | 0755);
        if (!inode) {
                dput(dentry);
-               mutex_unlock(&d_inode(parent)->i_mutex);
+               inode_unlock(d_inode(parent));
                return NULL;
        }
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        d_add(dentry, inode);
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        return dentry;
 }
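
These inode_lock()/inode_unlock() calls are, at this point in time, thin
wrappers around the inode's i_mutex, so the conversion is purely mechanical;
roughly (paraphrasing the <linux/fs.h> wrappers rather than quoting them):

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}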
 
index 2940bd769936cd7f75d2d20adc324914b0df4b84..25aba1613e2157f7a2e468007c202a16015b3f40 100644
@@ -1045,6 +1045,9 @@ static int tw_chrdev_open(struct inode *inode, struct file *file)
 static const struct file_operations tw_fops = {
        .owner          = THIS_MODULE,
        .unlocked_ioctl = tw_chrdev_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = tw_chrdev_ioctl,
+#endif
        .open           = tw_chrdev_open,
        .release        = NULL,
        .llseek         = noop_llseek,
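
Pointing .compat_ioctl at the native handler like this is safe only because
the driver's ioctl payload has the same layout for 32-bit and 64-bit user
space. When that does not hold, a translation shim is needed instead, along
these lines (tw_chrdev_compat_ioctl is a hypothetical name, not part of this
driver):

#ifdef CONFIG_COMPAT
static long tw_chrdev_compat_ioctl(struct file *file, unsigned int cmd,
				   unsigned long arg)
{
	/* compat_ptr() rewrites the 32-bit user pointer before reuse. */
	return tw_chrdev_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif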
index c1fe0d2f90ca353722cf7e2b16c2bd2c9b47601b..e2f31c93717db6e6a248d58ffaff6b8a2ba2d23f 100644
@@ -1106,6 +1106,7 @@ config SCSI_IPR
        tristate "IBM Power Linux RAID adapter support"
        depends on PCI && SCSI && ATA
        select FW_LOADER
+       select IRQ_POLL
        ---help---
          This driver supports the IBM Power Linux family RAID adapters.
          This includes IBM pSeries 5712, 5703, 5709, and 570A, as well
@@ -1620,23 +1621,6 @@ config ATARI_SCSI
          ST-DMA, replacing ACSI).  It does NOT support other schemes, like
          in the Hades (without DMA).
 
-config ATARI_SCSI_TOSHIBA_DELAY
-       bool "Long delays for Toshiba CD-ROMs"
-       depends on ATARI_SCSI
-       help
-         This option increases the delay after a SCSI arbitration to
-         accommodate some flaky Toshiba CD-ROM drives. Say Y if you intend to
-         use a Toshiba CD-ROM drive; otherwise, the option is not needed and
-         would impact performance a bit, so say N.
-
-config ATARI_SCSI_RESET_BOOT
-       bool "Reset SCSI-devices at boottime"
-       depends on ATARI_SCSI
-       help
-         Reset the devices on your Atari whenever it boots.  This makes the
-         boot process fractionally longer but may assist recovery from errors
-         that leave the devices with SCSI operations partway completed.
-
 config MAC_SCSI
        tristate "Macintosh NCR5380 SCSI"
        depends on MAC && SCSI=y
index a777e5c412df262b3f7fc72b55676d681ade9cb8..d72867257346eebfe885bb189eb7e5a61cd6fbbe 100644
@@ -1,17 +1,17 @@
-/* 
+/*
  * NCR 5380 generic driver routines.  These should make it *trivial*
- *      to implement 5380 SCSI drivers under Linux with a non-trantor
- *      architecture.
+ * to implement 5380 SCSI drivers under Linux with a non-trantor
+ * architecture.
  *
- *      Note that these routines also work with NR53c400 family chips.
+ * Note that these routines also work with NR53c400 family chips.
  *
  * Copyright 1993, Drew Eckhardt
- *      Visionary Computing 
- *      (Unix and Linux consulting and custom programming)
- *      drew@colorado.edu
- *      +1 (303) 666-5836
+ * Visionary Computing
+ * (Unix and Linux consulting and custom programming)
+ * drew@colorado.edu
+ * +1 (303) 666-5836
  *
- * For more information, please consult 
+ * For more information, please consult
  *
  * NCR 5380 Family
  * SCSI Protocol Controller
  */
 
 /*
- * Revision 1.10 1998/9/2      Alan Cox
- *                             (alan@lxorguk.ukuu.org.uk)
- * Fixed up the timer lockups reported so far. Things still suck. Looking 
- * forward to 2.3 and per device request queues. Then it'll be possible to
- * SMP thread this beast and improve life no end.
- * Revision 1.9  1997/7/27     Ronald van Cuijlenborg
- *                             (ronald.van.cuijlenborg@tip.nl or nutty@dds.nl)
- * (hopefully) fixed and enhanced USLEEP
- * added support for DTC3181E card (for Mustek scanner)
- *
-
- * Revision 1.8                        Ingmar Baumgart
- *                             (ingmar@gonzo.schwaben.de)
- * added support for NCR53C400a card
- *
-
- * Revision 1.7  1996/3/2       Ray Van Tassle (rayvt@comm.mot.com)
- * added proc_info
- * added support needed for DTC 3180/3280
- * fixed a couple of bugs
- *
-
- * Revision 1.5  1994/01/19  09:14:57  drew
- * Fixed udelay() hack that was being used on DATAOUT phases
- * instead of a proper wait for the final handshake.
- *
- * Revision 1.4  1994/01/19  06:44:25  drew
- * *** empty log message ***
- *
- * Revision 1.3  1994/01/19  05:24:40  drew
- * Added support for TCR LAST_BYTE_SENT bit.
- *
- * Revision 1.2  1994/01/15  06:14:11  drew
- * REAL DMA support, bug fixes.
- *
- * Revision 1.1  1994/01/15  06:00:54  drew
- * Initial revision
- *
+ * With contributions from Ray Van Tassle, Ingmar Baumgart,
+ * Ronald van Cuijlenborg, Alan Cox and others.
  */
 
 /*
- * Further development / testing that should be done : 
+ * Further development / testing that should be done :
  * 1.  Cleanup the NCR5380_transfer_dma function and DMA operation complete
- *     code so that everything does the same thing that's done at the 
- *     end of a pseudo-DMA read operation.
+ * code so that everything does the same thing that's done at the
+ * end of a pseudo-DMA read operation.
  *
  * 2.  Fix REAL_DMA (interrupt driven, polled works fine) -
- *     basically, transfer size needs to be reduced by one 
- *     and the last byte read as is done with PSEUDO_DMA.
- * 
- * 4.  Test SCSI-II tagged queueing (I have no devices which support 
- *      tagged queueing)
- *
- * 5.  Test linked command handling code after Eric is ready with 
- *      the high level code.
+ * basically, transfer size needs to be reduced by one
+ * and the last byte read as is done with PSEUDO_DMA.
+ *
+ * 4.  Test SCSI-II tagged queueing (I have no devices which support
+ * tagged queueing)
  */
-#include <scsi/scsi_dbg.h>
-#include <scsi/scsi_transport_spi.h>
-
-#if (NDEBUG & NDEBUG_LISTS)
-#define LIST(x,y) {printk("LINE:%d   Adding %p to %p\n", __LINE__, (void*)(x), (void*)(y)); if ((x)==(y)) udelay(5); }
-#define REMOVE(w,x,y,z) {printk("LINE:%d   Removing: %p->%p  %p->%p \n", __LINE__, (void*)(w), (void*)(x), (void*)(y), (void*)(z)); if ((x)==(y)) udelay(5); }
-#else
-#define LIST(x,y)
-#define REMOVE(w,x,y,z)
-#endif
 
 #ifndef notyet
-#undef LINKED
 #undef REAL_DMA
 #endif
 
-#ifdef REAL_DMA_POLL
-#undef READ_OVERRUNS
-#define READ_OVERRUNS
-#endif
-
 #ifdef BOARD_REQUIRES_NO_DELAY
 #define io_recovery_delay(x)
 #else
 /*
  * Design
  *
- * This is a generic 5380 driver.  To use it on a different platform, 
+ * This is a generic 5380 driver.  To use it on a different platform,
  * one simply writes appropriate system specific macros (ie, data
- * transfer - some PC's will use the I/O bus, 68K's must use 
+ * transfer - some PC's will use the I/O bus, 68K's must use
  * memory mapped) and drops this file in their 'C' wrapper.
  *
- * (Note from hch:  unfortunately it was not enough for the different
- * m68k folks and instead of improving this driver they copied it
- * and hacked it up for their needs.  As a consequence they lost
- * most updates to this driver.  Maybe someone will fix all these
- * drivers to use a common core one day..)
- *
- * As far as command queueing, two queues are maintained for 
+ * As far as command queueing, two queues are maintained for
  * each 5380 in the system - commands that haven't been issued yet,
- * and commands that are currently executing.  This means that an 
- * unlimited number of commands may be queued, letting 
- * more commands propagate from the higher driver levels giving higher 
- * throughput.  Note that both I_T_L and I_T_L_Q nexuses are supported, 
- * allowing multiple commands to propagate all the way to a SCSI-II device 
+ * and commands that are currently executing.  This means that an
+ * unlimited number of commands may be queued, letting
+ * more commands propagate from the higher driver levels giving higher
+ * throughput.  Note that both I_T_L and I_T_L_Q nexuses are supported,
+ * allowing multiple commands to propagate all the way to a SCSI-II device
  * while a command is already executing.
  *
  *
- * Issues specific to the NCR5380 : 
- *
- * When used in a PIO or pseudo-dma mode, the NCR5380 is a braindead 
- * piece of hardware that requires you to sit in a loop polling for 
- * the REQ signal as long as you are connected.  Some devices are 
- * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect 
- * while doing long seek operations.
- * 
- * The workaround for this is to keep track of devices that have
- * disconnected.  If the device hasn't disconnected, for commands that
- * should disconnect, we do something like 
+ * Issues specific to the NCR5380 :
  *
- * while (!REQ is asserted) { sleep for N usecs; poll for M usecs }
- * 
- * Some tweaking of N and M needs to be done.  An algorithm based 
- * on "time to data" would give the best results as long as short time
- * to datas (ie, on the same track) were considered, however these 
+ * When used in a PIO or pseudo-dma mode, the NCR5380 is a braindead
+ * piece of hardware that requires you to sit in a loop polling for
+ * the REQ signal as long as you are connected.  Some devices are
+ * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect
+ * while doing long seek operations. [...] These
  * broken devices are the exception rather than the rule and I'd rather
  * spend my time optimizing for the normal case.
  *
  * which is started from a workqueue for each NCR5380 host in the
  * system.  It attempts to establish I_T_L or I_T_L_Q nexuses by
  * removing the commands from the issue queue and calling
- * NCR5380_select() if a nexus is not established. 
+ * NCR5380_select() if a nexus is not established.
  *
  * Once a nexus is established, the NCR5380_information_transfer()
  * phase goes through the various phases as instructed by the target.
  * if the target goes into MSG IN and sends a DISCONNECT message,
  * the command structure is placed into the per instance disconnected
- * queue, and NCR5380_main tries to find more work.  If the target is 
+ * queue, and NCR5380_main tries to find more work.  If the target is
  * idle for too long, the system will try to sleep.
  *
  * If a command has disconnected, eventually an interrupt will trigger,
  * calling NCR5380_intr()  which will in turn call NCR5380_reselect
  * to reestablish a nexus.  This will run main if necessary.
  *
- * On command termination, the done function will be called as 
+ * On command termination, the done function will be called as
  * appropriate.
  *
- * SCSI pointers are maintained in the SCp field of SCSI command 
+ * SCSI pointers are maintained in the SCp field of SCSI command
  * structures, being initialized after the command is connected
  * in NCR5380_select, and set as appropriate in NCR5380_information_transfer.
  * Note that in violation of the standard, an implicit SAVE POINTERS operation
 /*
  * Using this file :
 * This file is a skeleton Linux SCSI driver for the NCR 5380 series
- * of chips.  To use it, you write an architecture specific functions 
+ * of chips.  To use it, you write architecture-specific functions
  * and macros and include this file in your driver.
  *
- * These macros control options : 
- * AUTOPROBE_IRQ - if defined, the NCR5380_probe_irq() function will be 
- *      defined.
- * 
+ * These macros control options :
+ * AUTOPROBE_IRQ - if defined, the NCR5380_probe_irq() function will be
+ * defined.
+ *
  * AUTOSENSE - if defined, REQUEST SENSE will be performed automatically
- *      for commands that return with a CHECK CONDITION status. 
+ * for commands that return with a CHECK CONDITION status.
  *
  * DIFFERENTIAL - if defined, NCR53c81 chips will use external differential
- *      transceivers. 
+ * transceivers.
  *
  * DONT_USE_INTR - if defined, never use interrupts, even if we probe or
- *      override-configure an IRQ.
- *
- * LIMIT_TRANSFERSIZE - if defined, limit the pseudo-dma transfers to 512
- *      bytes at a time.  Since interrupts are disabled by default during
- *      these transfers, we might need this to give reasonable interrupt
- *      service time if the transfer size gets too large.
- *
- * LINKED - if defined, linked commands are supported.
+ * override-configure an IRQ.
  *
  * PSEUDO_DMA - if defined, PSEUDO DMA is used during the data transfer phases.
  *
  * REAL_DMA - if defined, REAL DMA is used during the data transfer phases.
  *
  * REAL_DMA_POLL - if defined, REAL DMA is used but the driver doesn't
- *      rely on phase mismatch and EOP interrupts to determine end 
- *      of phase.
- *
- * UNSAFE - leave interrupts enabled during pseudo-DMA transfers.  You
- *          only really want to use this if you're having a problem with
- *          dropped characters during high speed communications, and even
- *          then, you're going to be better off twiddling with transfersize
- *          in the high level code.
- *
- * Defaults for these will be provided although the user may want to adjust 
- * these to allocate CPU resources to the SCSI driver or "real" code.
- * 
- * USLEEP_SLEEP - amount of time, in jiffies, to sleep
- *
- * USLEEP_POLL - amount of time, in jiffies, to poll
+ * rely on phase mismatch and EOP interrupts to determine end
+ * of phase.
  *
  * These macros MUST be defined :
- * NCR5380_local_declare() - declare any local variables needed for your
- *      transfer routines.
  *
- * NCR5380_setup(instance) - initialize any local variables needed from a given
- *      instance of the host adapter for NCR5380_{read,write,pread,pwrite}
- * 
  * NCR5380_read(register)  - read from the specified register
  *
- * NCR5380_write(register, value) - write to the specific register 
+ * NCR5380_write(register, value) - write to the specific register
  *
- * NCR5380_implementation_fields  - additional fields needed for this 
- *      specific implementation of the NCR5380
+ * NCR5380_implementation_fields  - additional fields needed for this
+ * specific implementation of the NCR5380
  *
  * Either real DMA *or* pseudo DMA may be implemented
- * REAL functions : 
+ * REAL functions :
  * NCR5380_REAL_DMA should be defined if real DMA is to be used.
- * Note that the DMA setup functions should return the number of bytes 
- *      that they were able to program the controller for.
+ * Note that the DMA setup functions should return the number of bytes
+ * that they were able to program the controller for.
  *
- * Also note that generic i386/PC versions of these macros are 
- *      available as NCR5380_i386_dma_write_setup,
- *      NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual.
+ * Also note that generic i386/PC versions of these macros are
+ * available as NCR5380_i386_dma_write_setup,
+ * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual.
  *
  * NCR5380_dma_write_setup(instance, src, count) - initialize
  * NCR5380_dma_read_setup(instance, dst, count) - initialize
  * NCR5380_pread(instance, dst, count);
  *
  * The generic driver is initialized by calling NCR5380_init(instance),
- * after setting the appropriate host specific fields and ID.  If the 
+ * after setting the appropriate host specific fields and ID.  If the
  * driver wishes to autoprobe for an IRQ line, the NCR5380_probe_irq(instance,
  * possible) function may be used.
  */
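
As a concrete illustration of that wrapper scheme, a minimal port-I/O board
driver would define the access macros and then pull in this core file; the
names, the port-mapped style, and the NCR5380_init() call below are modeled
on existing users of this skeleton and are assumptions, not requirements:

#define NCR5380_read(reg)		inb(instance->io_port + (reg))
#define NCR5380_write(reg, value)	outb(value, instance->io_port + (reg))
#define NCR5380_implementation_fields	/* no extra per-host fields */

#include "NCR5380.c"

static int board_init(struct Scsi_Host *instance)
{
	/* after setting instance->io_port, instance->this_id, etc. */
	return NCR5380_init(instance, FLAG_NO_PSEUDO_DMA);
}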
 
-static int do_abort(struct Scsi_Host *host);
-static void do_reset(struct Scsi_Host *host);
+static int do_abort(struct Scsi_Host *);
+static void do_reset(struct Scsi_Host *);
 
-/*
- *     initialize_SCp          -       init the scsi pointer field
- *     @cmd: command block to set up
+/**
+ * initialize_SCp - init the scsi pointer field
+ * @cmd: command block to set up
  *
- *     Set up the internal fields in the SCSI command.
+ * Set up the internal fields in the SCSI command.
  */
 
 static inline void initialize_SCp(struct scsi_cmnd *cmd)
 {
-       /* 
-        * Initialize the Scsi Pointer field so that all of the commands in the 
+       /*
+        * Initialize the Scsi Pointer field so that all of the commands in the
         * various queues are valid.
         */
 
@@ -295,120 +198,123 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd)
                cmd->SCp.ptr = NULL;
                cmd->SCp.this_residual = 0;
        }
+
+       cmd->SCp.Status = 0;
+       cmd->SCp.Message = 0;
 }
 
 /**
- *     NCR5380_poll_politely   -       wait for NCR5380 status bits
- *     @instance: controller to poll
- *     @reg: 5380 register to poll
- *     @bit: Bitmask to check
- *     @val: Value required to exit
- *
- *     Polls the NCR5380 in a reasonably efficient manner waiting for
- *     an event to occur, after a short quick poll we begin giving the
- *     CPU back in non IRQ contexts
- *
- *     Returns the value of the register or a negative error code.
+ * NCR5380_poll_politely2 - wait for two chip register values
+ * @instance: controller to poll
+ * @reg1: 5380 register to poll
+ * @bit1: Bitmask to check
+ * @val1: Expected value
+ * @reg2: Second 5380 register to poll
+ * @bit2: Second bitmask to check
+ * @val2: Second expected value
+ * @wait: Time-out in jiffies
+ *
+ * Polls the chip in a reasonably efficient manner waiting for an
+ * event to occur. After a short quick poll we begin to yield the CPU
+ * (if possible). In irq contexts the time-out is arbitrarily limited.
+ * Callers may hold locks as long as they are held in irq mode.
+ *
+ * Returns 0 if either or both event(s) occurred otherwise -ETIMEDOUT.
  */
-static int NCR5380_poll_politely(struct Scsi_Host *instance, int reg, int bit, int val, int t)
+
+static int NCR5380_poll_politely2(struct Scsi_Host *instance,
+                                  int reg1, int bit1, int val1,
+                                  int reg2, int bit2, int val2, int wait)
 {
-       NCR5380_local_declare();
-       int n = 500;            /* At about 8uS a cycle for the cpu access */
-       unsigned long end = jiffies + t;
-       int r;
-       
-       NCR5380_setup(instance);
-
-       while( n-- > 0)
-       {
-               r = NCR5380_read(reg);
-               if((r & bit) == val)
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       unsigned long deadline = jiffies + wait;
+       unsigned long n;
+
+       /* Busy-wait for up to 10 ms */
+       n = min(10000U, jiffies_to_usecs(wait));
+       n *= hostdata->accesses_per_ms;
+       n /= 2000;
+       do {
+               if ((NCR5380_read(reg1) & bit1) == val1)
+                       return 0;
+               if ((NCR5380_read(reg2) & bit2) == val2)
                        return 0;
                cpu_relax();
-       }
-       
-       /* t time yet ? */
-       while(time_before(jiffies, end))
-       {
-               r = NCR5380_read(reg);
-               if((r & bit) == val)
+       } while (n--);
+
+       if (irqs_disabled() || in_interrupt())
+               return -ETIMEDOUT;
+
+       /* Repeatedly sleep for 1 ms until deadline */
+       while (time_is_after_jiffies(deadline)) {
+               schedule_timeout_uninterruptible(1);
+               if ((NCR5380_read(reg1) & bit1) == val1)
+                       return 0;
+               if ((NCR5380_read(reg2) & bit2) == val2)
                        return 0;
-               if(!in_interrupt())
-                       cond_resched();
-               else
-                       cpu_relax();
        }
+
        return -ETIMEDOUT;
 }
 
-static struct {
-       unsigned char value;
-       const char *name;
-} phases[] __maybe_unused = {
-       {PHASE_DATAOUT, "DATAOUT"}, 
-       {PHASE_DATAIN, "DATAIN"}, 
-       {PHASE_CMDOUT, "CMDOUT"}, 
-       {PHASE_STATIN, "STATIN"}, 
-       {PHASE_MSGOUT, "MSGOUT"}, 
-       {PHASE_MSGIN, "MSGIN"}, 
-       {PHASE_UNKNOWN, "UNKNOWN"}
-};
+static inline int NCR5380_poll_politely(struct Scsi_Host *instance,
+                                        int reg, int bit, int val, int wait)
+{
+       return NCR5380_poll_politely2(instance, reg, bit, val,
+                                               reg, bit, val, wait);
+}
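
A typical caller waits on a single status bit and treats -ETIMEDOUT as a bus
error; for instance (an illustrative call site, using a one-second timeout
as seen elsewhere in this driver):

static int wait_for_req(struct Scsi_Host *instance)
{
	/* Wait up to a second for the target to assert REQ, yielding the
	 * CPU once the initial busy-wait window has passed. */
	return NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ);
}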
 
 #if NDEBUG
 static struct {
        unsigned char mask;
        const char *name;
-} signals[] = { 
-       {SR_DBP, "PARITY"}, 
-       {SR_RST, "RST"}, 
-       {SR_BSY, "BSY"}, 
-       {SR_REQ, "REQ"}, 
-       {SR_MSG, "MSG"}, 
-       {SR_CD, "CD"}, 
-       {SR_IO, "IO"}, 
-       {SR_SEL, "SEL"}, 
+} signals[] = {
+       {SR_DBP, "PARITY"},
+       {SR_RST, "RST"},
+       {SR_BSY, "BSY"},
+       {SR_REQ, "REQ"},
+       {SR_MSG, "MSG"},
+       {SR_CD, "CD"},
+       {SR_IO, "IO"},
+       {SR_SEL, "SEL"},
        {0, NULL}
-}, 
+},
 basrs[] = {
-       {BASR_ATN, "ATN"}, 
-       {BASR_ACK, "ACK"}, 
+       {BASR_ATN, "ATN"},
+       {BASR_ACK, "ACK"},
        {0, NULL}
-}, 
-icrs[] = { 
-       {ICR_ASSERT_RST, "ASSERT RST"}, 
-       {ICR_ASSERT_ACK, "ASSERT ACK"}, 
-       {ICR_ASSERT_BSY, "ASSERT BSY"}, 
-       {ICR_ASSERT_SEL, "ASSERT SEL"}, 
-       {ICR_ASSERT_ATN, "ASSERT ATN"}, 
-       {ICR_ASSERT_DATA, "ASSERT DATA"}, 
+},
+icrs[] = {
+       {ICR_ASSERT_RST, "ASSERT RST"},
+       {ICR_ASSERT_ACK, "ASSERT ACK"},
+       {ICR_ASSERT_BSY, "ASSERT BSY"},
+       {ICR_ASSERT_SEL, "ASSERT SEL"},
+       {ICR_ASSERT_ATN, "ASSERT ATN"},
+       {ICR_ASSERT_DATA, "ASSERT DATA"},
        {0, NULL}
-}, 
-mrs[] = { 
-       {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, 
-       {MR_TARGET, "MODE TARGET"}, 
-       {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, 
-       {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"}, 
-       {MR_MONITOR_BSY, "MODE MONITOR BSY"}, 
-       {MR_DMA_MODE, "MODE DMA"}, 
-       {MR_ARBITRATE, "MODE ARBITRATION"}, 
+},
+mrs[] = {
+       {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"},
+       {MR_TARGET, "MODE TARGET"},
+       {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"},
+       {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"},
+       {MR_ENABLE_EOP_INTR, "MODE EOP INTR"},
+       {MR_MONITOR_BSY, "MODE MONITOR BSY"},
+       {MR_DMA_MODE, "MODE DMA"},
+       {MR_ARBITRATE, "MODE ARBITRATION"},
        {0, NULL}
 };
 
 /**
- *     NCR5380_print   -       print scsi bus signals
- *     @instance:      adapter state to dump
- *
- *     Print the SCSI bus signals for debugging purposes
+ * NCR5380_print - print scsi bus signals
+ * @instance: adapter state to dump
  *
- *     Locks: caller holds hostdata lock (not essential)
+ * Print the SCSI bus signals for debugging purposes
  */
 
 static void NCR5380_print(struct Scsi_Host *instance)
 {
-       NCR5380_local_declare();
        unsigned char status, data, basr, mr, icr, i;
-       NCR5380_setup(instance);
 
        data = NCR5380_read(CURRENT_SCSI_DATA_REG);
        status = NCR5380_read(STATUS_REG);
@@ -435,117 +341,56 @@ static void NCR5380_print(struct Scsi_Host *instance)
        printk("\n");
 }
 
+static struct {
+       unsigned char value;
+       const char *name;
+} phases[] = {
+       {PHASE_DATAOUT, "DATAOUT"},
+       {PHASE_DATAIN, "DATAIN"},
+       {PHASE_CMDOUT, "CMDOUT"},
+       {PHASE_STATIN, "STATIN"},
+       {PHASE_MSGOUT, "MSGOUT"},
+       {PHASE_MSGIN, "MSGIN"},
+       {PHASE_UNKNOWN, "UNKNOWN"}
+};
 
-/* 
- *     NCR5380_print_phase     -       show SCSI phase
- *     @instance: adapter to dump
- *
- *     Print the current SCSI phase for debugging purposes
+/**
+ * NCR5380_print_phase - show SCSI phase
+ * @instance: adapter to dump
  *
- *     Locks: none
+ * Print the current SCSI phase for debugging purposes
  */
 
 static void NCR5380_print_phase(struct Scsi_Host *instance)
 {
-       NCR5380_local_declare();
        unsigned char status;
        int i;
-       NCR5380_setup(instance);
 
        status = NCR5380_read(STATUS_REG);
        if (!(status & SR_REQ))
-               printk("scsi%d : REQ not asserted, phase unknown.\n", instance->host_no);
+               shost_printk(KERN_DEBUG, instance, "REQ not asserted, phase unknown.\n");
        else {
-               for (i = 0; (phases[i].value != PHASE_UNKNOWN) && (phases[i].value != (status & PHASE_MASK)); ++i);
-               printk("scsi%d : phase %s\n", instance->host_no, phases[i].name);
+               for (i = 0; (phases[i].value != PHASE_UNKNOWN) &&
+                    (phases[i].value != (status & PHASE_MASK)); ++i)
+                       ;
+               shost_printk(KERN_DEBUG, instance, "phase %s\n", phases[i].name);
        }
 }
 #endif
 
-/*
- * These need tweaking, and would probably work best as per-device 
- * flags initialized differently for disk, tape, cd, etc devices.
- * People with broken devices are free to experiment as to what gives
- * the best results for them.
- *
- * USLEEP_SLEEP should be a minimum seek time.
- *
- * USLEEP_POLL should be a maximum rotational latency.
- */
-#ifndef USLEEP_SLEEP
-/* 20 ms (reasonable hard disk speed) */
-#define USLEEP_SLEEP msecs_to_jiffies(20)
-#endif
-/* 300 RPM (floppy speed) */
-#ifndef USLEEP_POLL
-#define USLEEP_POLL msecs_to_jiffies(200)
-#endif
-#ifndef USLEEP_WAITLONG
-/* RvC: (reasonable time to wait on select error) */
-#define USLEEP_WAITLONG USLEEP_SLEEP
-#endif
-
-/* 
- * Function : int should_disconnect (unsigned char cmd)
- *
- * Purpose : decide whether a command would normally disconnect or 
- *      not, since if it won't disconnect we should go to sleep.
- *
- * Input : cmd - opcode of SCSI command
- *
- * Returns : DISCONNECT_LONG if we should disconnect for a really long 
- *      time (ie always, sleep, look for REQ active, sleep), 
- *      DISCONNECT_TIME_TO_DATA if we would only disconnect for a normal
- *      time-to-data delay, DISCONNECT_NONE if this command would return
- *      immediately.
- *
- *      Future sleep algorithms based on time to data can exploit 
- *      something like this so they can differentiate between "normal" 
- *      (ie, read, write, seek) and unusual commands (ie, * format).
- *
- * Note : We don't deal with commands that handle an immediate disconnect,
- *        
- */
 
-static int should_disconnect(unsigned char cmd)
-{
-       switch (cmd) {
-       case READ_6:
-       case WRITE_6:
-       case SEEK_6:
-       case READ_10:
-       case WRITE_10:
-       case SEEK_10:
-               return DISCONNECT_TIME_TO_DATA;
-       case FORMAT_UNIT:
-       case SEARCH_HIGH:
-       case SEARCH_LOW:
-       case SEARCH_EQUAL:
-               return DISCONNECT_LONG;
-       default:
-               return DISCONNECT_NONE;
-       }
-}
-
-static void NCR5380_set_timer(struct NCR5380_hostdata *hostdata, unsigned long timeout)
-{
-       hostdata->time_expires = jiffies + timeout;
-       schedule_delayed_work(&hostdata->coroutine, timeout);
-}
-
-
-static int probe_irq __initdata = 0;
+static int probe_irq __initdata;
 
 /**
- *     probe_intr      -       helper for IRQ autoprobe
- *     @irq: interrupt number
- *     @dev_id: unused
- *     @regs: unused
+ * probe_intr  -       helper for IRQ autoprobe
+ * @irq: interrupt number
+ * @dev_id: unused
+ * @regs: unused
  *
- *     Set a flag to indicate the IRQ in question was received. This is
- *     used by the IRQ probe code.
+ * Set a flag to indicate the IRQ in question was received. This is
+ * used by the IRQ probe code.
  */
+
 static irqreturn_t __init probe_intr(int irq, void *dev_id)
 {
        probe_irq = irq;
@@ -553,24 +398,20 @@ static irqreturn_t __init probe_intr(int irq, void *dev_id)
 }
 
 /**
- *     NCR5380_probe_irq       -       find the IRQ of an NCR5380
- *     @instance: NCR5380 controller
- *     @possible: bitmask of ISA IRQ lines
- *
- *     Autoprobe for the IRQ line used by the NCR5380 by triggering an IRQ
- *     and then looking to see what interrupt actually turned up.
+ * NCR5380_probe_irq   -       find the IRQ of an NCR5380
+ * @instance: NCR5380 controller
+ * @possible: bitmask of ISA IRQ lines
  *
- *     Locks: none, irqs must be enabled on entry
+ * Autoprobe for the IRQ line used by the NCR5380 by triggering an IRQ
+ * and then looking to see what interrupt actually turned up.
  */
 
 static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance,
                                                int possible)
 {
-       NCR5380_local_declare();
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        unsigned long timeout;
        int trying_irqs, i, mask;
-       NCR5380_setup(instance);
 
        for (trying_irqs = 0, i = 1, mask = 2; i < 16; ++i, mask <<= 1)
                if ((mask & possible) && (request_irq(i, &probe_intr, 0, "NCR-probe", NULL) == 0))
@@ -581,7 +422,7 @@ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance,
 
        /*
         * An interrupt is triggered whenever BSY = false, SEL = true
-        * and a bit set in the SELECT_ENABLE_REG is asserted on the 
+        * and a bit set in the SELECT_ENABLE_REG is asserted on the
         * SCSI bus.
         *
         * Note that the bus is only driven when the phase control signals
@@ -596,7 +437,7 @@ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance,
 
        while (probe_irq == NO_IRQ && time_before(jiffies, timeout))
                schedule_timeout_uninterruptible(1);
-       
+
        NCR5380_write(SELECT_ENABLE_REG, 0);
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
 
@@ -608,12 +449,10 @@ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance,
 }
 
 /**
- *     NCR58380_info - report driver and host information
- *     @instance: relevant scsi host instance
- *
- *     For use as the host template info() handler.
+ * NCR5380_info - report driver and host information
+ * @instance: relevant scsi host instance
  *
- *     Locks: none
+ * For use as the host template info() handler.
  */
 
 static const char *NCR5380_info(struct Scsi_Host *instance)
@@ -633,20 +472,14 @@ static void prepare_info(struct Scsi_Host *instance)
                 "can_queue %d, cmd_per_lun %d, "
                 "sg_tablesize %d, this_id %d, "
                 "flags { %s%s%s}, "
-#if defined(USLEEP_POLL) && defined(USLEEP_WAITLONG)
-                "USLEEP_POLL %lu, USLEEP_WAITLONG %lu, "
-#endif
                 "options { %s} ",
                 instance->hostt->name, instance->io_port, instance->n_io_port,
                 instance->base, instance->irq,
                 instance->can_queue, instance->cmd_per_lun,
                 instance->sg_tablesize, instance->this_id,
-                hostdata->flags & FLAG_NCR53C400     ? "NCR53C400 "     : "",
-                hostdata->flags & FLAG_DTC3181E      ? "DTC3181E "      : "",
+                hostdata->flags & FLAG_NO_DMA_FIXUP  ? "NO_DMA_FIXUP "  : "",
                 hostdata->flags & FLAG_NO_PSEUDO_DMA ? "NO_PSEUDO_DMA " : "",
-#if defined(USLEEP_POLL) && defined(USLEEP_WAITLONG)
-                USLEEP_POLL, USLEEP_WAITLONG,
-#endif
+                hostdata->flags & FLAG_TOSHIBA_DELAY ? "TOSHIBA_DELAY " : "",
 #ifdef AUTOPROBE_IRQ
                 "AUTOPROBE_IRQ "
 #endif
@@ -664,47 +497,11 @@ static void prepare_info(struct Scsi_Host *instance)
 #endif
 #ifdef PSEUDO_DMA
                 "PSEUDO_DMA "
-#endif
-#ifdef UNSAFE
-                "UNSAFE "
-#endif
-#ifdef NCR53C400
-                "NCR53C400 "
 #endif
                 "");
 }
 
-/**
- *     NCR5380_print_status    -       dump controller info
- *     @instance: controller to dump
- *
- *     Print commands in the various queues, called from NCR5380_abort 
- *     and NCR5380_debug to aid debugging.
- *
- *     Locks: called functions disable irqs
- */
-
-static void NCR5380_print_status(struct Scsi_Host *instance)
-{
-       NCR5380_dprint(NDEBUG_ANY, instance);
-       NCR5380_dprint_phase(NDEBUG_ANY, instance);
-}
-
 #ifdef PSEUDO_DMA
-/******************************************/
-/*
- * /proc/scsi/[dtc pas16 t128 generic]/[0-ASC_NUM_BOARD_SUPPORTED]
- *
- * *buffer: I/O buffer
- * **start: if inout == FALSE pointer into buffer where user read should start
- * offset: current offset
- * length: length of buffer
- * hostno: Scsi_Host host_no
- * inout: TRUE - user is writing; FALSE - user is reading
- *
- * Return the number of bytes read from or written
- */
-
 static int __maybe_unused NCR5380_write_info(struct Scsi_Host *instance,
        char *buffer, int length)
 {
@@ -714,104 +511,41 @@ static int __maybe_unused NCR5380_write_info(struct Scsi_Host *instance,
        hostdata->spin_max_w = 0;
        return 0;
 }
-#endif
-
-static
-void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m);
-static
-void lprint_command(unsigned char *cmd, struct seq_file *m);
-static
-void lprint_opcode(int opcode, struct seq_file *m);
 
 static int __maybe_unused NCR5380_show_info(struct seq_file *m,
-       struct Scsi_Host *instance)
+                                            struct Scsi_Host *instance)
 {
-       struct NCR5380_hostdata *hostdata;
-       struct scsi_cmnd *ptr;
-
-       hostdata = (struct NCR5380_hostdata *) instance->hostdata;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
 
-#ifdef PSEUDO_DMA
        seq_printf(m, "Highwater I/O busy spin counts: write %d, read %d\n",
                hostdata->spin_max_w, hostdata->spin_max_r);
-#endif
-       spin_lock_irq(instance->host_lock);
-       if (!hostdata->connected)
-               seq_printf(m, "scsi%d: no currently connected command\n", instance->host_no);
-       else
-               lprint_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected, m);
-       seq_printf(m, "scsi%d: issue_queue\n", instance->host_no);
-       for (ptr = (struct scsi_cmnd *) hostdata->issue_queue; ptr; ptr = (struct scsi_cmnd *) ptr->host_scribble)
-               lprint_Scsi_Cmnd(ptr, m);
-
-       seq_printf(m, "scsi%d: disconnected_queue\n", instance->host_no);
-       for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr; ptr = (struct scsi_cmnd *) ptr->host_scribble)
-               lprint_Scsi_Cmnd(ptr, m);
-       spin_unlock_irq(instance->host_lock);
        return 0;
 }
-
-static void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m)
-{
-       seq_printf(m, "scsi%d : destination target %d, lun %llu\n", cmd->device->host->host_no, cmd->device->id, cmd->device->lun);
-       seq_puts(m, "        command = ");
-       lprint_command(cmd->cmnd, m);
-}
-
-static void lprint_command(unsigned char *command, struct seq_file *m)
-{
-       int i, s;
-       lprint_opcode(command[0], m);
-       for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i)
-               seq_printf(m, "%02x ", command[i]);
-       seq_putc(m, '\n');
-}
-
-static void lprint_opcode(int opcode, struct seq_file *m)
-{
-       seq_printf(m, "%2d (0x%02x)", opcode, opcode);
-}
-
+#endif
 
 /**
- *     NCR5380_init    -       initialise an NCR5380
- *     @instance: adapter to configure
- *     @flags: control flags
+ * NCR5380_init - initialise an NCR5380
+ * @instance: adapter to configure
+ * @flags: control flags
  *
- *     Initializes *instance and corresponding 5380 chip,
- *      with flags OR'd into the initial flags value.
+ * Initializes *instance and corresponding 5380 chip,
+ * with flags OR'd into the initial flags value.
  *
- *     Notes : I assume that the host, hostno, and id bits have been
- *      set correctly.  I don't care about the irq and other fields. 
+ * Notes : I assume that the host, hostno, and id bits have been
+ * set correctly. I don't care about the irq and other fields.
  *
- *     Returns 0 for success
- *
- *     Locks: interrupts must be enabled when we are called 
+ * Returns 0 for success
  */
 
 static int NCR5380_init(struct Scsi_Host *instance, int flags)
 {
-       NCR5380_local_declare();
-       int i, pass;
-       unsigned long timeout;
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
-
-       if(in_interrupt())
-               printk(KERN_ERR "NCR5380_init called with interrupts off!\n");
-       /* 
-        * On NCR53C400 boards, NCR5380 registers are mapped 8 past 
-        * the base address.
-        */
-
-#ifdef NCR53C400
-       if (flags & FLAG_NCR53C400)
-               instance->NCR5380_instance_name += NCR53C400_address_adjust;
-#endif
-
-       NCR5380_setup(instance);
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       int i;
+       unsigned long deadline;
 
-       hostdata->aborted = 0;
+       hostdata->host = instance;
        hostdata->id_mask = 1 << instance->this_id;
+       hostdata->id_higher_mask = 0;
        for (i = hostdata->id_mask; i <= 0x80; i <<= 1)
                if (i > hostdata->id_mask)
                        hostdata->id_higher_mask |= i;
@@ -820,21 +554,21 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags)
 #ifdef REAL_DMA
        hostdata->dmalen = 0;
 #endif
-       hostdata->targets_present = 0;
+       spin_lock_init(&hostdata->lock);
        hostdata->connected = NULL;
-       hostdata->issue_queue = NULL;
-       hostdata->disconnected_queue = NULL;
-       
-       INIT_DELAYED_WORK(&hostdata->coroutine, NCR5380_main);
-       
-       /* The CHECK code seems to break the 53C400. Will check it later maybe */
-       if (flags & FLAG_NCR53C400)
-               hostdata->flags = FLAG_HAS_LAST_BYTE_SENT | flags;
-       else
-               hostdata->flags = FLAG_CHECK_LAST_BYTE_SENT | flags;
+       hostdata->sensing = NULL;
+       INIT_LIST_HEAD(&hostdata->autosense);
+       INIT_LIST_HEAD(&hostdata->unissued);
+       INIT_LIST_HEAD(&hostdata->disconnected);
 
-       hostdata->host = instance;
-       hostdata->time_expires = 0;
+       hostdata->flags = flags;
+
+       INIT_WORK(&hostdata->main_task, NCR5380_main);
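+       /* An ordered (max_active = 1) workqueue serialises main_task per
+        * host; WQ_MEM_RECLAIM is set because this work submits SCSI
+        * commands that may be needed to make progress under memory
+        * pressure.
+        */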
+       hostdata->work_q = alloc_workqueue("ncr5380_%d",
+                               WQ_UNBOUND | WQ_MEM_RECLAIM,
+                               1, instance->host_no);
+       if (!hostdata->work_q)
+               return -ENOMEM;
 
        prepare_info(instance);
 
@@ -843,43 +577,69 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags)
        NCR5380_write(TARGET_COMMAND_REG, 0);
        NCR5380_write(SELECT_ENABLE_REG, 0);
 
-#ifdef NCR53C400
-       if (hostdata->flags & FLAG_NCR53C400) {
-               NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE);
-       }
-#endif
+       /* Calibrate register polling loop */
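+       /* Sync to the next jiffies transition, then count STATUS_REG reads
+        * over a 256 ms window; accesses_per_ms gives polling loops a way
+        * to convert a time budget into a register-access budget.
+        */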
+       i = 0;
+       deadline = jiffies + 1;
+       do {
+               cpu_relax();
+       } while (time_is_after_jiffies(deadline));
+       deadline += msecs_to_jiffies(256);
+       do {
+               NCR5380_read(STATUS_REG);
+               ++i;
+               cpu_relax();
+       } while (time_is_after_jiffies(deadline));
+       hostdata->accesses_per_ms = i / 256;
 
-       /* 
-        * Detect and correct bus wedge problems.
-        *
-        * If the system crashed, it may have crashed in a state 
-        * where a SCSI command was still executing, and the 
-        * SCSI bus is not in a BUS FREE STATE.
-        *
-        * If this is the case, we'll try to abort the currently
-        * established nexus which we know nothing about, and that
-        * failing, do a hard reset of the SCSI bus 
-        */
+       return 0;
+}
+
+/**
+ * NCR5380_maybe_reset_bus - Detect and correct bus wedge problems.
+ * @instance: adapter to check
+ *
+ * If the system crashed, it may have crashed with a connected target and
+ * the SCSI bus busy. Check for BUS FREE phase. If not, try to abort the
+ * currently established nexus, which we know nothing about. Failing that
+ * do a bus reset.
+ *
+ * Note that a bus reset will cause the chip to assert IRQ.
+ *
+ * Returns 0 if successful, otherwise -ENXIO.
+ */
+
+static int NCR5380_maybe_reset_bus(struct Scsi_Host *instance)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       int pass;
 
        for (pass = 1; (NCR5380_read(STATUS_REG) & SR_BSY) && pass <= 6; ++pass) {
                switch (pass) {
                case 1:
                case 3:
                case 5:
-                       printk(KERN_INFO "scsi%d: SCSI bus busy, waiting up to five seconds\n", instance->host_no);
-                       timeout = jiffies + 5 * HZ;
-                       NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, 0, 5*HZ);
+                       shost_printk(KERN_ERR, instance, "SCSI bus busy, waiting up to five seconds\n");
+                       NCR5380_poll_politely(instance,
+                                             STATUS_REG, SR_BSY, 0, 5 * HZ);
                        break;
                case 2:
-                       printk(KERN_WARNING "scsi%d: bus busy, attempting abort\n", instance->host_no);
+                       shost_printk(KERN_ERR, instance, "bus busy, attempting abort\n");
                        do_abort(instance);
                        break;
                case 4:
-                       printk(KERN_WARNING "scsi%d: bus busy, attempting reset\n", instance->host_no);
+                       shost_printk(KERN_ERR, instance, "bus busy, attempting reset\n");
                        do_reset(instance);
+                       /* Wait after a reset; the SCSI standard calls for
+                        * 250ms, we wait 500ms to be on the safe side.
+                        * But some Toshiba CD-ROMs need ten times that.
+                        */
+                       if (hostdata->flags & FLAG_TOSHIBA_DELAY)
+                               msleep(2500);
+                       else
+                               msleep(500);
                        break;
                case 6:
-                       printk(KERN_ERR "scsi%d: bus locked solid or invalid override\n", instance->host_no);
+                       shost_printk(KERN_ERR, instance, "bus locked solid\n");
                        return -ENXIO;
                }
        }
@@ -887,450 +647,513 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags)
 }
 
 /**
- *     NCR5380_exit    -       remove an NCR5380
- *     @instance: adapter to remove
+ * NCR5380_exit - remove an NCR5380
+ * @instance: adapter to remove
+ *
+ * Assumes that no more work can be queued (e.g. by NCR5380_intr).
  */
 
 static void NCR5380_exit(struct Scsi_Host *instance)
 {
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
 
-       cancel_delayed_work_sync(&hostdata->coroutine);
+       cancel_work_sync(&hostdata->main_task);
+       destroy_workqueue(hostdata->work_q);
 }
 
 /**
- *     NCR5380_queue_command           -       queue a command
- *     @cmd: SCSI command
- *     @done: completion handler
- *
- *      cmd is added to the per instance issue_queue, with minor 
- *      twiddling done to the host specific fields of cmd.  If the 
- *      main coroutine is not running, it is restarted.
+ * complete_cmd - finish processing a command and return it to the SCSI ML
+ * @instance: the host instance
+ * @cmd: command to complete
+ */
+
+static void complete_cmd(struct Scsi_Host *instance,
+                         struct scsi_cmnd *cmd)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+
+       dsprintk(NDEBUG_QUEUES, instance, "complete_cmd: cmd %p\n", cmd);
+
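+       /* If this was the autosense REQUEST SENSE, restore the original
+        * command before completing it; flag DID_ERROR if the sense
+        * transfer itself failed.
+        */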
+       if (hostdata->sensing == cmd) {
+               /* Autosense processing ends here */
+               if ((cmd->result & 0xff) != SAM_STAT_GOOD) {
+                       scsi_eh_restore_cmnd(cmd, &hostdata->ses);
+                       set_host_byte(cmd, DID_ERROR);
+               } else
+                       scsi_eh_restore_cmnd(cmd, &hostdata->ses);
+               hostdata->sensing = NULL;
+       }
+
+       hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun);
+
+       cmd->scsi_done(cmd);
+}
+
+/**
+ * NCR5380_queue_command - queue a command
+ * @instance: the relevant SCSI adapter
+ * @cmd: SCSI command
  *
- *     Locks: host lock taken by caller
+ * cmd is added to the per-instance issue queue, with minor
+ * twiddling done to the host specific fields of cmd.  If the
+ * main coroutine is not running, it is restarted.
  */
 
-static int NCR5380_queue_command_lck(struct scsi_cmnd *cmd, void (*done) (struct scsi_cmnd *))
+static int NCR5380_queue_command(struct Scsi_Host *instance,
+                                 struct scsi_cmnd *cmd)
 {
-       struct Scsi_Host *instance = cmd->device->host;
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
-       struct scsi_cmnd *tmp;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
+       unsigned long flags;
 
 #if (NDEBUG & NDEBUG_NO_WRITE)
        switch (cmd->cmnd[0]) {
        case WRITE_6:
        case WRITE_10:
-               printk("scsi%d : WRITE attempted with NO_WRITE debugging flag set\n", instance->host_no);
+               shost_printk(KERN_DEBUG, instance, "WRITE attempted with NDEBUG_NO_WRITE set\n");
                cmd->result = (DID_ERROR << 16);
-               done(cmd);
+               cmd->scsi_done(cmd);
                return 0;
        }
-#endif                         /* (NDEBUG & NDEBUG_NO_WRITE) */
+#endif /* (NDEBUG & NDEBUG_NO_WRITE) */
 
-       /* 
-        * We use the host_scribble field as a pointer to the next command  
-        * in a queue 
-        */
-
-       cmd->host_scribble = NULL;
-       cmd->scsi_done = done;
        cmd->result = 0;
 
-       /* 
-        * Insert the cmd into the issue queue. Note that REQUEST SENSE 
+       spin_lock_irqsave(&hostdata->lock, flags);
+
+       /*
+        * Insert the cmd into the issue queue. Note that REQUEST SENSE
         * commands are added to the head of the queue since any command will
-        * clear the contingent allegiance condition that exists and the 
+        * clear the contingent allegiance condition that exists and the
         * sense data is only guaranteed to be valid while the condition exists.
         */
 
-       if (!(hostdata->issue_queue) || (cmd->cmnd[0] == REQUEST_SENSE)) {
-               LIST(cmd, hostdata->issue_queue);
-               cmd->host_scribble = (unsigned char *) hostdata->issue_queue;
-               hostdata->issue_queue = cmd;
-       } else {
-               for (tmp = (struct scsi_cmnd *) hostdata->issue_queue; tmp->host_scribble; tmp = (struct scsi_cmnd *) tmp->host_scribble);
-               LIST(cmd, tmp);
-               tmp->host_scribble = (unsigned char *) cmd;
-       }
-       dprintk(NDEBUG_QUEUES, "scsi%d : command added to %s of queue\n", instance->host_no, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail");
+       if (cmd->cmnd[0] == REQUEST_SENSE)
+               list_add(&ncmd->list, &hostdata->unissued);
+       else
+               list_add_tail(&ncmd->list, &hostdata->unissued);
+
+       spin_unlock_irqrestore(&hostdata->lock, flags);
+
+       dsprintk(NDEBUG_QUEUES, instance, "command %p added to %s of queue\n",
+                cmd, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail");
 
-       /* Run the coroutine if it isn't already running. */
        /* Kick off command processing */
-       schedule_delayed_work(&hostdata->coroutine, 0);
+       queue_work(hostdata->work_q, &hostdata->main_task);
        return 0;
 }
 
-static DEF_SCSI_QCMD(NCR5380_queue_command)
+/**
+ * dequeue_next_cmd - dequeue a command for processing
+ * @instance: the scsi host instance
+ *
+ * Priority is given to commands on the autosense queue. These commands
+ * need autosense because of a CHECK CONDITION result.
+ *
+ * Returns a command pointer if a command is found for a target that is
+ * not already busy. Otherwise returns NULL.
+ */
+
+static struct scsi_cmnd *dequeue_next_cmd(struct Scsi_Host *instance)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       struct NCR5380_cmd *ncmd;
+       struct scsi_cmnd *cmd;
+
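+       /* hostdata->busy[] holds a bitmap of in-flight LUNs per target;
+        * a set bit means that I_T_L nexus already has an outstanding
+        * command.
+        */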
+       if (list_empty(&hostdata->autosense)) {
+               list_for_each_entry(ncmd, &hostdata->unissued, list) {
+                       cmd = NCR5380_to_scmd(ncmd);
+                       dsprintk(NDEBUG_QUEUES, instance, "dequeue: cmd=%p target=%d busy=0x%02x lun=%llu\n",
+                                cmd, scmd_id(cmd), hostdata->busy[scmd_id(cmd)], cmd->device->lun);
+
+                       if (!(hostdata->busy[scmd_id(cmd)] & (1 << cmd->device->lun))) {
+                               list_del(&ncmd->list);
+                               dsprintk(NDEBUG_QUEUES, instance,
+                                        "dequeue: removed %p from issue queue\n", cmd);
+                               return cmd;
+                       }
+               }
+       } else {
+               /* Autosense processing begins here */
+               ncmd = list_first_entry(&hostdata->autosense,
+                                       struct NCR5380_cmd, list);
+               list_del(&ncmd->list);
+               cmd = NCR5380_to_scmd(ncmd);
+               dsprintk(NDEBUG_QUEUES, instance,
+                        "dequeue: removed %p from autosense queue\n", cmd);
+               scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0);
+               hostdata->sensing = cmd;
+               return cmd;
+       }
+       return NULL;
+}
+
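+/**
+ * requeue_cmd - return a command to a queue so it can be retried
+ * @instance: the scsi host instance
+ * @cmd: command to requeue
+ *
+ * If @cmd was undergoing autosense, restore the original command and put
+ * it back on the autosense queue; otherwise return it to the head of the
+ * unissued queue. Called with hostdata->lock held.
+ */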
+static void requeue_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
+
+       if (hostdata->sensing) {
+               scsi_eh_restore_cmnd(cmd, &hostdata->ses);
+               list_add(&ncmd->list, &hostdata->autosense);
+               hostdata->sensing = NULL;
+       } else
+               list_add(&ncmd->list, &hostdata->unissued);
+}
 
 /**
- *     NCR5380_main    -       NCR state machines
- *
- *     NCR5380_main is a coroutine that runs as long as more work can 
- *      be done on the NCR5380 host adapters in a system.  Both 
- *      NCR5380_queue_command() and NCR5380_intr() will try to start it 
- *      in case it is not running.
- * 
- *     Locks: called as its own thread with no locks held. Takes the
- *     host lock and called routines may take the isa dma lock.
+ * NCR5380_main - NCR state machines
+ *
+ * NCR5380_main is a coroutine that runs as long as more work can
+ * be done on the NCR5380 host adapters in a system.  Both
+ * NCR5380_queue_command() and NCR5380_intr() will try to start it
+ * in case it is not running.
  */
 
 static void NCR5380_main(struct work_struct *work)
 {
        struct NCR5380_hostdata *hostdata =
-               container_of(work, struct NCR5380_hostdata, coroutine.work);
+               container_of(work, struct NCR5380_hostdata, main_task);
        struct Scsi_Host *instance = hostdata->host;
-       struct scsi_cmnd *tmp, *prev;
+       struct scsi_cmnd *cmd;
        int done;
-       
-       spin_lock_irq(instance->host_lock);
+
        do {
-               /* Lock held here */
                done = 1;
-               if (!hostdata->connected && !hostdata->selecting) {
-                       dprintk(NDEBUG_MAIN, "scsi%d : not connected\n", instance->host_no);
-                       /*
-                        * Search through the issue_queue for a command destined
-                        * for a target that's not busy.
-                        */
-                       for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, prev = NULL; tmp; prev = tmp, tmp = (struct scsi_cmnd *) tmp->host_scribble)
-                       {
-                               if (prev != tmp)
-                                   dprintk(NDEBUG_LISTS, "MAIN tmp=%p   target=%d   busy=%d lun=%llu\n", tmp, tmp->device->id, hostdata->busy[tmp->device->id], tmp->device->lun);
-                               /*  When we find one, remove it from the issue queue. */
-                               if (!(hostdata->busy[tmp->device->id] &
-                                     (1 << (u8)(tmp->device->lun & 0xff)))) {
-                                       if (prev) {
-                                               REMOVE(prev, prev->host_scribble, tmp, tmp->host_scribble);
-                                               prev->host_scribble = tmp->host_scribble;
-                                       } else {
-                                               REMOVE(-1, hostdata->issue_queue, tmp, tmp->host_scribble);
-                                               hostdata->issue_queue = (struct scsi_cmnd *) tmp->host_scribble;
-                                       }
-                                       tmp->host_scribble = NULL;
 
-                                       /* 
-                                        * Attempt to establish an I_T_L nexus here. 
-                                        * On success, instance->hostdata->connected is set.
-                                        * On failure, we must add the command back to the
-                                        *   issue queue so we can keep trying. 
-                                        */
-                                       dprintk(NDEBUG_MAIN|NDEBUG_QUEUES, "scsi%d : main() : command for target %d lun %llu removed from issue_queue\n", instance->host_no, tmp->device->id, tmp->device->lun);
-       
-                                       /*
-                                        * A successful selection is defined as one that 
-                                        * leaves us with the command connected and 
-                                        * in hostdata->connected, OR has terminated the
-                                        * command.
-                                        *
-                                        * With successful commands, we fall through
-                                        * and see if we can do an information transfer,
-                                        * with failures we will restart.
-                                        */
-                                       hostdata->selecting = NULL;
-                                       /* RvC: have to preset this to indicate a new command is being performed */
+               spin_lock_irq(&hostdata->lock);
+               while (!hostdata->connected &&
+                      (cmd = dequeue_next_cmd(instance))) {
 
-                                       /*
-                                        * REQUEST SENSE commands are issued without tagged
-                                        * queueing, even on SCSI-II devices because the
-                                        * contingent allegiance condition exists for the
-                                        * entire unit.
-                                        */
+                       dsprintk(NDEBUG_MAIN, instance, "main: dequeued %p\n", cmd);
 
-                                       if (!NCR5380_select(instance, tmp)) {
-                                               break;
-                                       } else {
-                                               LIST(tmp, hostdata->issue_queue);
-                                               tmp->host_scribble = (unsigned char *) hostdata->issue_queue;
-                                               hostdata->issue_queue = tmp;
-                                               done = 0;
-                                               dprintk(NDEBUG_MAIN|NDEBUG_QUEUES, "scsi%d : main(): select() failed, returned to issue_queue\n", instance->host_no);
-                                       }
-                                       /* lock held here still */
-                               }       /* if target/lun is not busy */
-                       }       /* for */
-                       /* exited locked */
-               }       /* if (!hostdata->connected) */
-               if (hostdata->selecting) {
-                       tmp = (struct scsi_cmnd *) hostdata->selecting;
-                       /* Selection will drop and retake the lock */
-                       if (!NCR5380_select(instance, tmp)) {
-                               /* Ok ?? */
+                       /*
+                        * Attempt to establish an I_T_L nexus here.
+                        * On success, instance->hostdata->connected is set.
+                        * On failure, we must add the command back to the
+                        * issue queue so we can keep trying.
+                        */
+                       /*
+                        * REQUEST SENSE commands are issued without tagged
+                        * queueing, even on SCSI-II devices because the
+                        * contingent allegiance condition exists for the
+                        * entire unit.
+                        */
+
+                       cmd = NCR5380_select(instance, cmd);
+                       if (!cmd) {
+                               dsprintk(NDEBUG_MAIN, instance, "main: select complete\n");
                        } else {
-                               /* RvC: device failed, so we wait a long time
-                                  this is needed for Mustek scanners, that
-                                  do not respond to commands immediately
-                                  after a scan */
-                               printk(KERN_DEBUG "scsi%d: device %d did not respond in time\n", instance->host_no, tmp->device->id);
-                               LIST(tmp, hostdata->issue_queue);
-                               tmp->host_scribble = (unsigned char *) hostdata->issue_queue;
-                               hostdata->issue_queue = tmp;
-                               NCR5380_set_timer(hostdata, USLEEP_WAITLONG);
+                               dsprintk(NDEBUG_MAIN | NDEBUG_QUEUES, instance,
+                                        "main: select failed, returning %p to queue\n", cmd);
+                               requeue_cmd(instance, cmd);
                        }
-               }       /* if hostdata->selecting */
+               }
                if (hostdata->connected
 #ifdef REAL_DMA
                    && !hostdata->dmalen
 #endif
-                   && (!hostdata->time_expires || time_before_eq(hostdata->time_expires, jiffies))
                    ) {
-                       dprintk(NDEBUG_MAIN, "scsi%d : main() : performing information transfer\n", instance->host_no);
+                       dsprintk(NDEBUG_MAIN, instance, "main: performing information transfer\n");
                        NCR5380_information_transfer(instance);
-                       dprintk(NDEBUG_MAIN, "scsi%d : main() : done set false\n", instance->host_no);
                        done = 0;
-               } else
-                       break;
+               }
+               spin_unlock_irq(&hostdata->lock);
+               if (!done)
+                       cond_resched();
        } while (!done);
-       
-       spin_unlock_irq(instance->host_lock);
 }
 
 #ifndef DONT_USE_INTR
 
 /**
- *     NCR5380_intr    -       generic NCR5380 irq handler
- *     @irq: interrupt number
- *     @dev_id: device info
- *
- *     Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses
- *      from the disconnected queue, and restarting NCR5380_main() 
- *      as required.
- *
- *     Locks: takes the needed instance locks
+ * NCR5380_intr - generic NCR5380 irq handler
+ * @irq: interrupt number
+ * @dev_id: device info
+ *
+ * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses
+ * from the disconnected queue, and restarting NCR5380_main()
+ * as required.
+ *
+ * The chip can assert IRQ in any of six different conditions. The IRQ flag
+ * is then cleared by reading the Reset Parity/Interrupt Register (RPIR).
+ * Three of these six conditions are latched in the Bus and Status Register:
+ * - End of DMA (cleared by ending DMA Mode)
+ * - Parity error (cleared by reading RPIR)
+ * - Loss of BSY (cleared by reading RPIR)
+ * Two conditions have flag bits that are not latched:
+ * - Bus phase mismatch (non-maskable in DMA Mode, cleared by ending DMA Mode)
+ * - Bus reset (non-maskable)
+ * The remaining condition has no flag bit at all:
+ * - Selection/reselection
+ *
+ * Hence, establishing the cause(s) of any interrupt is partly guesswork.
+ * In "The DP8490 and DP5380 Comparison Guide", National Semiconductor
+ * claimed that "the design of the [DP8490] interrupt logic ensures
+ * interrupts will not be lost (they can be on the DP5380)."
+ * The L5380/53C80 datasheet from LOGIC Devices has more details.
+ *
+ * Checking for bus reset by reading RST is futile because of interrupt
+ * latency, but a bus reset will reset chip logic. Checking for parity error
+ * is unnecessary because that interrupt is never enabled. A Loss of BSY
+ * condition will clear DMA Mode. We can tell when this occurs because
+ * the Busy Monitor interrupt is enabled together with DMA Mode.
  */
 
-static irqreturn_t NCR5380_intr(int dummy, void *dev_id)
+static irqreturn_t NCR5380_intr(int irq, void *dev_id)
 {
-       NCR5380_local_declare();
        struct Scsi_Host *instance = dev_id;
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
-       int done;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       int handled = 0;
        unsigned char basr;
        unsigned long flags;
 
-       dprintk(NDEBUG_INTR, "scsi : NCR5380 irq %d triggered\n",
-               instance->irq);
+       spin_lock_irqsave(&hostdata->lock, flags);
+
+       basr = NCR5380_read(BUS_AND_STATUS_REG);
+       if (basr & BASR_IRQ) {
+               unsigned char mr = NCR5380_read(MODE_REG);
+               unsigned char sr = NCR5380_read(STATUS_REG);
+
+               dsprintk(NDEBUG_INTR, instance, "IRQ %d, BASR 0x%02x, SR 0x%02x, MR 0x%02x\n",
+                        irq, basr, sr, mr);
 
-       do {
-               done = 1;
-               spin_lock_irqsave(instance->host_lock, flags);
-               /* Look for pending interrupts */
-               NCR5380_setup(instance);
-               basr = NCR5380_read(BUS_AND_STATUS_REG);
-               /* XXX dispatch to appropriate routine if found and done=0 */
-               if (basr & BASR_IRQ) {
-                       NCR5380_dprint(NDEBUG_INTR, instance);
-                       if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) {
-                               done = 0;
-                               dprintk(NDEBUG_INTR, "scsi%d : SEL interrupt\n", instance->host_no);
-                               NCR5380_reselect(instance);
-                               (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-                       } else if (basr & BASR_PARITY_ERROR) {
-                               dprintk(NDEBUG_INTR, "scsi%d : PARITY interrupt\n", instance->host_no);
-                               (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-                       } else if ((NCR5380_read(STATUS_REG) & SR_RST) == SR_RST) {
-                               dprintk(NDEBUG_INTR, "scsi%d : RESET interrupt\n", instance->host_no);
-                               (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-                       } else {
 #if defined(REAL_DMA)
-                               /*
-                                * We should only get PHASE MISMATCH and EOP interrupts
-                                * if we have DMA enabled, so do a sanity check based on
-                                * the current setting of the MODE register.
-                                */
+               if ((mr & MR_DMA_MODE) || (mr & MR_MONITOR_BSY)) {
+                       /* Probably End of DMA, Phase Mismatch or Loss of BSY.
+                        * We ack IRQ after clearing Mode Register. Workarounds
+                        * for End of DMA errata need to happen in DMA Mode.
+                        */
 
-                               if ((NCR5380_read(MODE_REG) & MR_DMA) && ((basr & BASR_END_DMA_TRANSFER) || !(basr & BASR_PHASE_MATCH))) {
-                                       int transferred;
+                       int transferred;
+
+                       dsprintk(NDEBUG_INTR, instance, "interrupt in DMA mode\n");
 
-                                       if (!hostdata->connected)
-                                               panic("scsi%d : received end of DMA interrupt with no connected cmd\n", instance->hostno);
 
-                                       transferred = (hostdata->dmalen - NCR5380_dma_residual(instance));
-                                       hostdata->connected->SCp.this_residual -= transferred;
-                                       hostdata->connected->SCp.ptr += transferred;
-                                       hostdata->dmalen = 0;
+                       if (!hostdata->connected)
+                               panic("scsi%d : DMA interrupt with no connected cmd\n",
+                                     instance->host_no);
 
-                                       (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-                                                       
-                                       /* FIXME: we need to poll briefly then defer a workqueue task ! */
-                                       NCR5380_poll_politely(hostdata, BUS_AND_STATUS_REG, BASR_ACK, 0, 2*HZ);
+                       transferred = hostdata->dmalen - NCR5380_dma_residual(instance);
+                       hostdata->connected->SCp.this_residual -= transferred;
+                       hostdata->connected->SCp.ptr += transferred;
+                       hostdata->dmalen = 0;
 
-                                       NCR5380_write(MODE_REG, MR_BASE);
-                                       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                               }
-#else
-                               dprintk(NDEBUG_INTR, "scsi : unknown interrupt, BASR 0x%X, MR 0x%X, SR 0x%x\n", basr, NCR5380_read(MODE_REG), NCR5380_read(STATUS_REG));
-                               (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-#endif
+                       /* FIXME: we need to poll briefly then defer a workqueue task ! */
+                       NCR5380_poll_politely(hostdata, BUS_AND_STATUS_REG, BASR_ACK, 0, 2 * HZ);
+
+                       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+                       NCR5380_write(MODE_REG, MR_BASE);
+                       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+               } else
+#endif /* REAL_DMA */
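+               /* A reselecting target asserts SEL and I/O (BSY and RST
+                * deasserted) and drives our ID, along with its own, onto
+                * the data bus.
+                */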
+               if ((NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_mask) &&
+                   (sr & (SR_SEL | SR_IO | SR_BSY | SR_RST)) == (SR_SEL | SR_IO)) {
+                       /* Probably reselected */
+                       NCR5380_write(SELECT_ENABLE_REG, 0);
+                       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+
+                       dsprintk(NDEBUG_INTR, instance, "interrupt with SEL and IO\n");
+
+                       if (!hostdata->connected) {
+                               NCR5380_reselect(instance);
+                               queue_work(hostdata->work_q, &hostdata->main_task);
                        }
-               }       /* if BASR_IRQ */
-               spin_unlock_irqrestore(instance->host_lock, flags);
-               if(!done)
-                       schedule_delayed_work(&hostdata->coroutine, 0);
-       } while (!done);
-       return IRQ_HANDLED;
+                       if (!hostdata->connected)
+                               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
+               } else {
+                       /* Probably Bus Reset */
+                       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+
+                       dsprintk(NDEBUG_INTR, instance, "unknown interrupt\n");
+               }
+               handled = 1;
+       } else {
+               shost_printk(KERN_NOTICE, instance, "interrupt without IRQ bit\n");
+       }
+
+       spin_unlock_irqrestore(&hostdata->lock, flags);
+
+       return IRQ_RETVAL(handled);
 }
 
-#endif 
+#endif
 
-/* 
+/*
  * Function : int NCR5380_select(struct Scsi_Host *instance,
- *                               struct scsi_cmnd *cmd)
+ * struct scsi_cmnd *cmd)
  *
  * Purpose : establishes I_T_L or I_T_L_Q nexus for new or existing command,
- *      including ARBITRATION, SELECTION, and initial message out for 
- *      IDENTIFY and queue messages. 
- *
- * Inputs : instance - instantiation of the 5380 driver on which this 
- *      target lives, cmd - SCSI command to execute.
- * 
- * Returns : -1 if selection could not execute for some reason,
- *      0 if selection succeeded or failed because the target 
- *      did not respond.
- *
- * Side effects : 
- *      If bus busy, arbitration failed, etc, NCR5380_select() will exit 
- *              with registers as they should have been on entry - ie
- *              SELECT_ENABLE will be set appropriately, the NCR5380
- *              will cease to drive any SCSI bus signals.
- *
- *      If successful : I_T_L or I_T_L_Q nexus will be established, 
- *              instance->connected will be set to cmd.  
- *              SELECT interrupt will be disabled.
- *
- *      If failed (no target) : cmd->scsi_done() will be called, and the 
- *              cmd->result host byte set to DID_BAD_TARGET.
- *
- *     Locks: caller holds hostdata lock in IRQ mode
+ * including ARBITRATION, SELECTION, and initial message out for
+ * IDENTIFY and queue messages.
+ *
+ * Inputs : instance - instantiation of the 5380 driver on which this
+ * target lives, cmd - SCSI command to execute.
+ *
+ * Returns cmd if selection failed but should be retried,
+ * NULL if selection failed and should not be retried, or
+ * NULL if selection succeeded (hostdata->connected == cmd).
+ *
+ * Side effects :
+ * If bus busy, arbitration failed, etc, NCR5380_select() will exit
+ * with registers as they should have been on entry - ie
+ * SELECT_ENABLE will be set appropriately, the NCR5380
+ * will cease to drive any SCSI bus signals.
+ *
+ * If successful : I_T_L or I_T_L_Q nexus will be established,
+ * instance->connected will be set to cmd.
+ * SELECT interrupt will be disabled.
+ *
+ * If failed (no target) : cmd->scsi_done() will be called, and the
+ * cmd->result host byte set to DID_BAD_TARGET.
  */
-static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
+
+static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *instance,
+                                        struct scsi_cmnd *cmd)
 {
-       NCR5380_local_declare();
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        unsigned char tmp[3], phase;
        unsigned char *data;
        int len;
-       unsigned long timeout;
-       unsigned char value;
        int err;
-       NCR5380_setup(instance);
-
-       if (hostdata->selecting)
-               goto part2;
-
-       hostdata->restart_select = 0;
 
        NCR5380_dprint(NDEBUG_ARBITRATION, instance);
-       dprintk(NDEBUG_ARBITRATION, "scsi%d : starting arbitration, id = %d\n", instance->host_no, instance->this_id);
+       dsprintk(NDEBUG_ARBITRATION, instance, "starting arbitration, id = %d\n",
+                instance->this_id);
+
+       /*
+        * Arbitration and selection phases are slow and involve dropping the
+        * lock, so we have to watch out for EH. An exception handler may
+        * change 'selecting' to NULL. This function will then return NULL
+        * so that the caller will forget about 'cmd'. (During information
+        * transfer phases, EH may change 'connected' to NULL.)
+        */
+       hostdata->selecting = cmd;
 
-       /* 
-        * Set the phase bits to 0, otherwise the NCR5380 won't drive the 
+       /*
+        * Set the phase bits to 0, otherwise the NCR5380 won't drive the
         * data bus during SELECTION.
         */
 
        NCR5380_write(TARGET_COMMAND_REG, 0);
 
-       /* 
+       /*
         * Start arbitration.
         */
 
        NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask);
        NCR5380_write(MODE_REG, MR_ARBITRATE);
 
+       /* The chip now waits for BUS FREE phase. Then after the 800 ns
+        * Bus Free Delay, arbitration will begin.
+        */
 
-       /* We can be relaxed here, interrupts are on, we are
-          in workqueue context, the birds are singing in the trees */
-       spin_unlock_irq(instance->host_lock);
-       err = NCR5380_poll_politely(instance, INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS, ICR_ARBITRATION_PROGRESS, 5*HZ);
-       spin_lock_irq(instance->host_lock);
+       spin_unlock_irq(&hostdata->lock);
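+       /* Wait for arbitration to progress, but give up early if a
+        * reselection interrupt clears MR_ARBITRATE while the lock is
+        * dropped.
+        */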
+       err = NCR5380_poll_politely2(instance, MODE_REG, MR_ARBITRATE, 0,
+                       INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS,
+                                              ICR_ARBITRATION_PROGRESS, HZ);
+       spin_lock_irq(&hostdata->lock);
+       if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) {
+               /* Reselection interrupt */
+               goto out;
+       }
        if (err < 0) {
-               printk(KERN_DEBUG "scsi: arbitration timeout at %d\n", __LINE__);
                NCR5380_write(MODE_REG, MR_BASE);
-               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               goto failed;
+               shost_printk(KERN_ERR, instance,
+                            "select: arbitration timeout\n");
+               goto out;
        }
+       spin_unlock_irq(&hostdata->lock);
 
-       dprintk(NDEBUG_ARBITRATION, "scsi%d : arbitration complete\n", instance->host_no);
-
-       /* 
-        * The arbitration delay is 2.2us, but this is a minimum and there is 
-        * no maximum so we can safely sleep for ceil(2.2) usecs to accommodate
-        * the integral nature of udelay().
-        *
-        */
-
+       /* The SCSI-2 arbitration delay is 2.4 us */
        udelay(3);
 
        /* Check for lost arbitration */
-       if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) || (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) {
-               NCR5380_write(MODE_REG, MR_BASE);
-               dprintk(NDEBUG_ARBITRATION, "scsi%d : lost arbitration, deasserting MR_ARBITRATE\n", instance->host_no);
-               goto failed;
-       }
-       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_SEL);
-
-       if (!(hostdata->flags & FLAG_DTC3181E) &&
-           /* RvC: DTC3181E has some trouble with this
-            *      so we simply removed it. Seems to work with
-            *      only Mustek scanner attached
-            */
+       if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) ||
+           (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) ||
            (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) {
                NCR5380_write(MODE_REG, MR_BASE);
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-               dprintk(NDEBUG_ARBITRATION, "scsi%d : lost arbitration, deasserting ICR_ASSERT_SEL\n", instance->host_no);
-               goto failed;
+               dsprintk(NDEBUG_ARBITRATION, instance, "lost arbitration, deasserting MR_ARBITRATE\n");
+               spin_lock_irq(&hostdata->lock);
+               goto out;
        }
-       /* 
-        * Again, bus clear + bus settle time is 1.2us, however, this is 
+
+       /* After/during arbitration, BSY should be asserted.
+        * IBM DPES-31080 Version S31Q works now
+        * Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman)
+        */
+       NCR5380_write(INITIATOR_COMMAND_REG,
+                     ICR_BASE | ICR_ASSERT_SEL | ICR_ASSERT_BSY);
+
+       /*
+        * Again, bus clear + bus settle time is 1.2us, however, this is
         * a minimum so we'll udelay ceil(1.2)
         */
 
-       udelay(2);
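+       /* Some Toshiba CD-ROMs reportedly need a much longer settle delay,
+        * hence FLAG_TOSHIBA_DELAY (see also the post-reset delay above).
+        */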
+       if (hostdata->flags & FLAG_TOSHIBA_DELAY)
+               udelay(15);
+       else
+               udelay(2);
+
+       spin_lock_irq(&hostdata->lock);
+
+       /* NCR5380_reselect() clears MODE_REG after a reselection interrupt */
+       if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE))
+               goto out;
 
-       dprintk(NDEBUG_ARBITRATION, "scsi%d : won arbitration\n", instance->host_no);
+       if (!hostdata->selecting) {
+               NCR5380_write(MODE_REG, MR_BASE);
+               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+               goto out;
+       }
 
-       /* 
-        * Now that we have won arbitration, start Selection process, asserting 
+       dsprintk(NDEBUG_ARBITRATION, instance, "won arbitration\n");
+
+       /*
+        * Now that we have won arbitration, start Selection process, asserting
         * the host and target ID's on the SCSI bus.
         */
 
-       NCR5380_write(OUTPUT_DATA_REG, (hostdata->id_mask | (1 << scmd_id(cmd))));
+       NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask | (1 << scmd_id(cmd)));
 
-       /* 
+       /*
         * Raise ATN while SEL is true before BSY goes false from arbitration,
         * since this is the only way to guarantee that we'll get a MESSAGE OUT
         * phase immediately after selection.
         */
 
-       NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_BSY | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL));
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY |
+                     ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL);
        NCR5380_write(MODE_REG, MR_BASE);
 
-       /* 
+       /*
         * Reselect interrupts must be turned off prior to the dropping of BSY,
         * otherwise we will trigger an interrupt.
         */
        NCR5380_write(SELECT_ENABLE_REG, 0);
 
+       spin_unlock_irq(&hostdata->lock);
+
        /*
-        * The initiator shall then wait at least two deskew delays and release 
+        * The initiator shall then wait at least two deskew delays and release
         * the BSY signal.
         */
-       udelay(1);              /* wingel -- wait two bus deskew delay >2*45ns */
+       udelay(1);        /* wingel -- wait two bus deskew delay >2*45ns */
 
        /* Reset BSY */
-       NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL));
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA |
+                     ICR_ASSERT_ATN | ICR_ASSERT_SEL);
 
-       /* 
+       /*
         * Something weird happens when we cease to drive BSY - looks
-        * like the board/chip is letting us do another read before the 
+        * like the board/chip is letting us do another read before the
         * appropriate propagation delay has expired, and we're confusing
         * a BSY signal from ourselves as the target's response to SELECTION.
         *
         * A small delay (the 'C++' frontend breaks the pipeline with an
         * unnecessary jump, making it work on my 386-33/Trantor T128, the
-        * tighter 'C' code breaks and requires this) solves the problem - 
-        * the 1 us delay is arbitrary, and only used because this delay will 
-        * be the same on other platforms and since it works here, it should 
+        * tighter 'C' code breaks and requires this) solves the problem -
+        * the 1 us delay is arbitrary, and only used because this delay will
+        * be the same on other platforms and since it works here, it should
         * work there.
         *
         * wingel suggests that this could be due to failing to wait
@@ -1339,50 +1162,43 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
 
        udelay(1);
 
-       dprintk(NDEBUG_SELECTION, "scsi%d : selecting target %d\n", instance->host_no, scmd_id(cmd));
+       dsprintk(NDEBUG_SELECTION, instance, "selecting target %d\n", scmd_id(cmd));
 
-       /* 
-        * The SCSI specification calls for a 250 ms timeout for the actual 
+       /*
+        * The SCSI specification calls for a 250 ms timeout for the actual
         * selection.
         */
 
-       timeout = jiffies + msecs_to_jiffies(250);
+       err = NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, SR_BSY,
+                                   msecs_to_jiffies(250));
 
-       /* 
-        * XXX very interesting - we're seeing a bounce where the BSY we 
-        * asserted is being reflected / still asserted (propagation delay?)
-        * and it's detecting as true.  Sigh.
-        */
-
-       hostdata->select_time = 0;      /* we count the clock ticks at which we polled */
-       hostdata->selecting = cmd;
-
-part2:
-       /* RvC: here we enter after a sleeping period, or immediately after
-          execution of part 1
-          we poll only once ech clock tick */
-       value = NCR5380_read(STATUS_REG) & (SR_BSY | SR_IO);
-
-       if (!value && (hostdata->select_time < HZ/4)) {
-               /* RvC: we still must wait for a device response */
-               hostdata->select_time++;        /* after 25 ticks the device has failed */
-               NCR5380_set_timer(hostdata, 1);
-               return 0;       /* RvC: we return here with hostdata->selecting set,
-                                  to go to sleep */
-       }
-
-       hostdata->selecting = NULL;/* clear this pointer, because we passed the
-                                          waiting period */
        if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) {
+               spin_lock_irq(&hostdata->lock);
                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                NCR5380_reselect(instance);
-               printk("scsi%d : reselection after won arbitration?\n", instance->host_no);
+               if (!hostdata->connected)
+                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
+               shost_printk(KERN_ERR, instance, "reselection after won arbitration?\n");
+               goto out;
+       }
+
+       if (err < 0) {
+               spin_lock_irq(&hostdata->lock);
+               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               return -1;
+               /* Can't touch cmd if it has been reclaimed by the scsi ML */
+               if (hostdata->selecting) {
+                       cmd->result = DID_BAD_TARGET << 16;
+                       complete_cmd(instance, cmd);
+                       dsprintk(NDEBUG_SELECTION, instance, "target did not respond within 250ms\n");
+                       cmd = NULL;
+               }
+               goto out;
        }
-       /* 
-        * No less than two deskew delays after the initiator detects the 
-        * BSY signal is true, it shall release the SEL signal and may 
+
+       /*
+        * No less than two deskew delays after the initiator detects the
+        * BSY signal is true, it shall release the SEL signal and may
         * change the DATA BUS.                                     -wingel
         */
 
@@ -1390,53 +1206,38 @@ part2:
 
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
 
-       if (!(NCR5380_read(STATUS_REG) & SR_BSY)) {
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-               if (hostdata->targets_present & (1 << scmd_id(cmd))) {
-                       printk(KERN_DEBUG "scsi%d : weirdness\n", instance->host_no);
-                       if (hostdata->restart_select)
-                               printk(KERN_DEBUG "\trestart select\n");
-                       NCR5380_dprint(NDEBUG_SELECTION, instance);
-                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-                       return -1;
-               }
-               cmd->result = DID_BAD_TARGET << 16;
-               cmd->scsi_done(cmd);
-               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               dprintk(NDEBUG_SELECTION, "scsi%d : target did not respond within 250ms\n", instance->host_no);
-               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               return 0;
-       }
-       hostdata->targets_present |= (1 << scmd_id(cmd));
-
        /*
-        * Since we followed the SCSI spec, and raised ATN while SEL 
+        * Since we followed the SCSI spec, and raised ATN while SEL
         * was true but before BSY was false during selection, the information
         * transfer phase should be a MESSAGE OUT phase so that we can send the
         * IDENTIFY message.
-        * 
+        *
         * If SCSI-II tagged queuing is enabled, we also send a SIMPLE_QUEUE_TAG
         * message (2 bytes) with a tag ID that we increment with every command
         * until it wraps back to 0.
         *
         * XXX - it turns out that there are some broken SCSI-II devices,
-        *       which claim to support tagged queuing but fail when more than
-        *       some number of commands are issued at once.
+        * which claim to support tagged queuing but fail when more than
+        * some number of commands are issued at once.
         */
 
        /* Wait for start of REQ/ACK handshake */
 
-       spin_unlock_irq(instance->host_lock);
        err = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ);
-       spin_lock_irq(instance->host_lock);
-       
-       if(err) {
-               printk(KERN_ERR "scsi%d: timeout at NCR5380.c:%d\n", instance->host_no, __LINE__);
+       spin_lock_irq(&hostdata->lock);
+       if (err < 0) {
+               shost_printk(KERN_ERR, instance, "select: REQ timeout\n");
+               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               goto failed;
+               goto out;
+       }
+       if (!hostdata->selecting) {
+               do_abort(instance);
+               goto out;
        }
 
-       dprintk(NDEBUG_SELECTION, "scsi%d : target %d selected, going into MESSAGE OUT phase.\n", instance->host_no, cmd->device->id);
+       dsprintk(NDEBUG_SELECTION, instance, "target %d selected, going into MESSAGE OUT phase.\n",
+                scmd_id(cmd));
        tmp[0] = IDENTIFY(((instance->irq == NO_IRQ) ? 0 : 1), cmd->device->lun);
 
        len = 1;
@@ -1446,104 +1247,82 @@ part2:
        data = tmp;
        phase = PHASE_MSGOUT;
        NCR5380_transfer_pio(instance, &phase, &len, &data);
-       dprintk(NDEBUG_SELECTION, "scsi%d : nexus established.\n", instance->host_no);
+       dsprintk(NDEBUG_SELECTION, instance, "nexus established.\n");
        /* XXX need to handle errors here */
+
        hostdata->connected = cmd;
-       hostdata->busy[cmd->device->id] |= (1 << (cmd->device->lun & 0xFF));
+       hostdata->busy[cmd->device->id] |= 1 << cmd->device->lun;
 
        initialize_SCp(cmd);
 
-       return 0;
-
-       /* Selection failed */
-failed:
-       return -1;
+       cmd = NULL;
 
+out:
+       if (!hostdata->selecting)
+               return NULL;
+       hostdata->selecting = NULL;
+       return cmd;
 }
 
-/* 
- * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance, 
- *      unsigned char *phase, int *count, unsigned char **data)
+/*
+ * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance,
+ * unsigned char *phase, int *count, unsigned char **data)
  *
  * Purpose : transfers data in given phase using polled I/O
  *
- * Inputs : instance - instance of driver, *phase - pointer to 
- *      what phase is expected, *count - pointer to number of 
- *      bytes to transfer, **data - pointer to data pointer.
- * 
+ * Inputs : instance - instance of driver, *phase - pointer to
+ * what phase is expected, *count - pointer to number of
+ * bytes to transfer, **data - pointer to data pointer.
+ *
  * Returns : -1 when different phase is entered without transferring
- *      maximum number of bytes, 0 if all bytes or transferred or exit
- *      is in same phase.
+ * maximum number of bytes, 0 if all bytes are transferred or exit
+ * is in same phase.
  *
- *      Also, *phase, *count, *data are modified in place.
+ * Also, *phase, *count, *data are modified in place.
  *
  * XXX Note : handling for bus free may be useful.
  */
 
 /*
- * Note : this code is not as quick as it could be, however it 
+ * Note : this code is not as quick as it could be, however it
  * IS 100% reliable, and for the actual data transfer where speed
  * counts, we will always do a pseudo DMA or DMA transfer.
  */
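+
+/*
+ * A typical call, mirroring the do_abort() site below; *phase, *count and
+ * *data are all updated in place, so the caller can read back the residual:
+ *
+ *      unsigned char msg = ABORT, *data = &msg;
+ *      unsigned char phase = PHASE_MSGOUT;
+ *      int len = 1;
+ *
+ *      NCR5380_transfer_pio(instance, &phase, &len, &data);
+ *      if (len)
+ *              ...the ABORT byte was not transferred...
+ */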
 
-static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) {
-       NCR5380_local_declare();
+static int NCR5380_transfer_pio(struct Scsi_Host *instance,
+                               unsigned char *phase, int *count,
+                               unsigned char **data)
+{
        unsigned char p = *phase, tmp;
        int c = *count;
        unsigned char *d = *data;
-       /*
-        *      RvC: some administrative data to process polling time
-        */
-       int break_allowed = 0;
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
-       NCR5380_setup(instance);
-
-       if (!(p & SR_IO))
-               dprintk(NDEBUG_PIO, "scsi%d : pio write %d bytes\n", instance->host_no, c);
-       else
-               dprintk(NDEBUG_PIO, "scsi%d : pio read %d bytes\n", instance->host_no, c);
 
-       /* 
-        * The NCR5380 chip will only drive the SCSI bus when the 
+       /*
+        * The NCR5380 chip will only drive the SCSI bus when the
         * phase specified in the appropriate bits of the TARGET COMMAND
         * REGISTER match the STATUS REGISTER
         */
 
-        NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p));
-
-       /* RvC: don't know if this is necessary, but other SCSI I/O is short
-        *      so breaks are not necessary there
-        */
-       if ((p == PHASE_DATAIN) || (p == PHASE_DATAOUT)) {
-               break_allowed = 1;
-       }
-       do {
-               /* 
-                * Wait for assertion of REQ, after which the phase bits will be 
-                * valid 
-                */
+       NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p));
 
-               /* RvC: we simply poll once, after that we stop temporarily
-                *      and let the device buffer fill up
-                *      if breaking is not allowed, we keep polling as long as needed
+       do {
+               /*
+                * Wait for assertion of REQ, after which the phase bits will be
+                * valid
                 */
 
-               /* FIXME */
-               while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ) && !break_allowed);
-               if (!(tmp & SR_REQ)) {
-                       /* timeout condition */
-                       NCR5380_set_timer(hostdata, USLEEP_SLEEP);
+               if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0)
                        break;
-               }
 
-               dprintk(NDEBUG_HANDSHAKE, "scsi%d : REQ detected\n", instance->host_no);
+               dsprintk(NDEBUG_HANDSHAKE, instance, "REQ asserted\n");
 
                /* Check for phase mismatch */
-               if ((tmp & PHASE_MASK) != p) {
-                       dprintk(NDEBUG_HANDSHAKE, "scsi%d : phase mismatch\n", instance->host_no);
-                       NCR5380_dprint_phase(NDEBUG_HANDSHAKE, instance);
+               if ((NCR5380_read(STATUS_REG) & PHASE_MASK) != p) {
+                       dsprintk(NDEBUG_PIO, instance, "phase mismatch\n");
+                       NCR5380_dprint_phase(NDEBUG_PIO, instance);
                        break;
                }
+
                /* Do actual transfer from SCSI bus to / from memory */
                if (!(p & SR_IO))
                        NCR5380_write(OUTPUT_DATA_REG, *d);
@@ -1552,7 +1331,7 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase
 
                ++d;
 
-               /* 
+               /*
                 * The SCSI standard suggests that in MSGOUT phase, the initiator
                 * should drop ATN on the last byte of the message phase
                 * after REQ has been asserted for the handshake but before
@@ -1563,29 +1342,34 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase
                        if (!((p & SR_MSG) && c > 1)) {
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA);
                                NCR5380_dprint(NDEBUG_PIO, instance);
-                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ACK);
+                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
+                                             ICR_ASSERT_DATA | ICR_ASSERT_ACK);
                        } else {
-                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN);
+                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
+                                             ICR_ASSERT_DATA | ICR_ASSERT_ATN);
                                NCR5380_dprint(NDEBUG_PIO, instance);
-                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
+                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
+                                             ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
                        }
                } else {
                        NCR5380_dprint(NDEBUG_PIO, instance);
                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ACK);
                }
 
-               /* FIXME - if this fails bus reset ?? */
-               NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 5*HZ);
-               dprintk(NDEBUG_HANDSHAKE, "scsi%d : req false, handshake complete\n", instance->host_no);
+               if (NCR5380_poll_politely(instance,
+                                         STATUS_REG, SR_REQ, 0, 5 * HZ) < 0)
+                       break;
+
+               dsprintk(NDEBUG_HANDSHAKE, instance, "REQ negated, handshake complete\n");
 
 /*
- * We have several special cases to consider during REQ/ACK handshaking : 
- * 1.  We were in MSGOUT phase, and we are on the last byte of the 
- *      message.  ATN must be dropped as ACK is dropped.
+ * We have several special cases to consider during REQ/ACK handshaking :
+ * 1.  We were in MSGOUT phase, and we are on the last byte of the
+ * message.  ATN must be dropped as ACK is dropped.
  *
- * 2.  We are in a MSGIN phase, and we are on the last byte of the  
- *      message.  We must exit with ACK asserted, so that the calling
- *      code may raise ATN before dropping ACK to reject the message.
+ * 2.  We are in a MSGIN phase, and we are on the last byte of the
+ * message.  We must exit with ACK asserted, so that the calling
+ * code may raise ATN before dropping ACK to reject the message.
  *
  * 3.  ACK and ATN are clear and the target may proceed as normal.
  */
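+
+/*
+ * Case 2 is what makes MESSAGE REJECT possible: with ACK still asserted the
+ * initiator can raise ATN first and only then release ACK, e.g.
+ *
+ *      NCR5380_write(INITIATOR_COMMAND_REG,
+ *                    ICR_BASE | ICR_ASSERT_ACK | ICR_ASSERT_ATN);
+ *      NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
+ */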
@@ -1597,12 +1381,16 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase
                }
        } while (--c);
 
-       dprintk(NDEBUG_PIO, "scsi%d : residual %d\n", instance->host_no, c);
+       dsprintk(NDEBUG_PIO, instance, "residual %d\n", c);
 
        *count = c;
        *data = d;
        tmp = NCR5380_read(STATUS_REG);
-       if (tmp & SR_REQ)
+       /* The phase read from the bus is valid if either REQ is (already)
+        * asserted or if ACK hasn't been released yet. The latter applies if
+        * we're in MSG IN, DATA IN or STATUS and all bytes have been received.
+        */
+       if ((tmp & SR_REQ) || ((tmp & SR_IO) && c == 0))
                *phase = tmp & PHASE_MASK;
        else
                *phase = PHASE_UNKNOWN;
@@ -1614,79 +1402,80 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase
 }
 
 /**
- *     do_reset        -       issue a reset command
- *     @host: adapter to reset
+ * do_reset - issue a reset command
+ * @instance: adapter to reset
  *
- *     Issue a reset sequence to the NCR5380 and try and get the bus
- *     back into sane shape.
+ * Issue a reset sequence to the NCR5380 and try to get the bus
+ * back into sane shape.
  *
- *     Locks: caller holds queue lock
+ * This clears the reset interrupt flag because there may be no handler for
+ * it. When the driver is initialized, the NCR5380_intr() handler has not yet
+ * been installed. And when in EH we may have released the ST DMA interrupt.
  */
-static void do_reset(struct Scsi_Host *host) {
-       NCR5380_local_declare();
-       NCR5380_setup(host);
 
-       NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK));
+static void do_reset(struct Scsi_Host *instance)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       NCR5380_write(TARGET_COMMAND_REG,
+                     PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK));
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST);
-       udelay(25);
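+       /* SCSI-2 requires RST to be asserted for at least 25 us */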
+       udelay(50);
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+       (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+       local_irq_restore(flags);
 }
 
-/*
- * Function : do_abort (Scsi_Host *host)
- * 
- * Purpose : abort the currently established nexus.  Should only be 
- *      called from a routine which can drop into a 
- * 
- * Returns : 0 on success, -1 on failure.
- *
- * Locks: queue lock held by caller
- *     FIXME: sort this out and get new_eh running
+/**
+ * do_abort - abort the currently established nexus by going to
+ * MESSAGE OUT phase and sending an ABORT message.
+ * @instance: relevant scsi host instance
+ *
+ * Returns 0 on success, -1 on failure.
  */
 
-static int do_abort(struct Scsi_Host *host) {
-       NCR5380_local_declare();
+static int do_abort(struct Scsi_Host *instance)
+{
        unsigned char *msgptr, phase, tmp;
        int len;
        int rc;
-       NCR5380_setup(host);
-
 
        /* Request message out phase */
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
 
-       /* 
-        * Wait for the target to indicate a valid phase by asserting 
-        * REQ.  Once this happens, we'll have either a MSGOUT phase 
-        * and can immediately send the ABORT message, or we'll have some 
+       /*
+        * Wait for the target to indicate a valid phase by asserting
+        * REQ.  Once this happens, we'll have either a MSGOUT phase
+        * and can immediately send the ABORT message, or we'll have some
         * other phase and will have to source/sink data.
-        * 
+        *
         * We really don't care what value was on the bus or what value
         * the target sees, so we just handshake.
         */
 
-       rc = NCR5380_poll_politely(host, STATUS_REG, SR_REQ, SR_REQ, 60 * HZ);
-       
-       if(rc < 0)
-               return -1;
+       rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 10 * HZ);
+       if (rc < 0)
+               goto timeout;
+
+       tmp = NCR5380_read(STATUS_REG) & PHASE_MASK;
 
-       tmp = (unsigned char)rc;
-       
        NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp));
 
-       if ((tmp & PHASE_MASK) != PHASE_MSGOUT) {
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
-               rc = NCR5380_poll_politely(host, STATUS_REG, SR_REQ, 0, 3*HZ);
+       if (tmp != PHASE_MSGOUT) {
+               NCR5380_write(INITIATOR_COMMAND_REG,
+                             ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
+               rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 3 * HZ);
+               if (rc < 0)
+                       goto timeout;
                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
-               if(rc == -1)
-                       return -1;
        }
+
        tmp = ABORT;
        msgptr = &tmp;
        len = 1;
        phase = PHASE_MSGOUT;
-       NCR5380_transfer_pio(host, &phase, &len, &msgptr);
+       NCR5380_transfer_pio(instance, &phase, &len, &msgptr);
 
        /*
         * If we got here, and the command completed successfully,
@@ -1694,32 +1483,37 @@ static int do_abort(struct Scsi_Host *host) {
         */
 
        return len ? -1 : 0;
+
+timeout:
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+       return -1;
 }
 
 #if defined(REAL_DMA) || defined(PSEUDO_DMA) || defined (REAL_DMA_POLL)
-/* 
- * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance, 
- *      unsigned char *phase, int *count, unsigned char **data)
+/*
+ * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance,
+ * unsigned char *phase, int *count, unsigned char **data)
  *
  * Purpose : transfers data in given phase using either real
- *      or pseudo DMA.
+ * or pseudo DMA.
  *
- * Inputs : instance - instance of driver, *phase - pointer to 
- *      what phase is expected, *count - pointer to number of 
- *      bytes to transfer, **data - pointer to data pointer.
- * 
- * Returns : -1 when different phase is entered without transferring
- *      maximum number of bytes, 0 if all bytes or transferred or exit
- *      is in same phase.
+ * Inputs : instance - instance of driver, *phase - pointer to
+ * what phase is expected, *count - pointer to number of
+ * bytes to transfer, **data - pointer to data pointer.
  *
- *      Also, *phase, *count, *data are modified in place.
+ * Returns : -1 when different phase is entered without transferring
+ * maximum number of bytes, 0 if all bytes are transferred or exit
+ * is in same phase.
  *
- *     Locks: io_request lock held by caller
+ * Also, *phase, *count, *data are modified in place.
  */
 
 
-static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) {
-       NCR5380_local_declare();
+static int NCR5380_transfer_dma(struct Scsi_Host *instance,
+                               unsigned char *phase, int *count,
+                               unsigned char **data)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        register int c = *count;
        register unsigned char p = *phase;
        register unsigned char *d = *data;
@@ -1730,54 +1524,47 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase
        unsigned char saved_data = 0, overrun = 0, residue;
 #endif
 
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
-
-       NCR5380_setup(instance);
-
        if ((tmp = (NCR5380_read(STATUS_REG) & PHASE_MASK)) != p) {
                *phase = tmp;
                return -1;
        }
 #if defined(REAL_DMA) || defined(REAL_DMA_POLL)
-#ifdef READ_OVERRUNS
        if (p & SR_IO) {
-               c -= 2;
+               if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS))
+                       c -= 2;
        }
-#endif
-       dprintk(NDEBUG_DMA, "scsi%d : initializing DMA channel %d for %s, %d bytes %s %0x\n", instance->host_no, instance->dma_channel, (p & SR_IO) ? "reading" : "writing", c, (p & SR_IO) ? "to" : "from", (unsigned) d);
        hostdata->dma_len = (p & SR_IO) ? NCR5380_dma_read_setup(instance, d, c) : NCR5380_dma_write_setup(instance, d, c);
+
+       dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n",
+                (p & SR_IO) ? "receive" : "send", c, *data);
 #endif
 
        NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p));
 
 #ifdef REAL_DMA
-       NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_ENABLE_EOP_INTR | MR_MONITOR_BSY);
+       NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY |
+                               MR_ENABLE_EOP_INTR);
 #elif defined(REAL_DMA_POLL)
-       NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE);
+       NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY);
 #else
        /*
         * Note : on my sample board, watch-dog timeouts occurred when interrupts
-        * were not disabled for the duration of a single DMA transfer, from 
+        * were not disabled for the duration of a single DMA transfer, from
         * before the setting of DMA mode to after transfer of the last byte.
         */
 
-#if defined(PSEUDO_DMA) && defined(UNSAFE)
-       spin_unlock_irq(instance->host_lock);
-#endif
-       /* KLL May need eop and parity in 53c400 */
-       if (hostdata->flags & FLAG_NCR53C400)
-               NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE |
-                               MR_ENABLE_PAR_CHECK | MR_ENABLE_PAR_INTR |
-                               MR_ENABLE_EOP_INTR | MR_MONITOR_BSY);
+       if (hostdata->flags & FLAG_NO_DMA_FIXUP)
+               NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY |
+                                       MR_ENABLE_EOP_INTR);
        else
-               NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE);
+               NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY);
 #endif                         /* def REAL_DMA */
 
        dprintk(NDEBUG_DMA, "scsi%d : mode reg = 0x%X\n", instance->host_no, NCR5380_read(MODE_REG));
 
-       /* 
-        *      On the PAS16 at least I/O recovery delays are not needed here.
-        *      Everyone else seems to want them.
+       /*
+        * On the PAS16 at least I/O recovery delays are not needed here.
+        * Everyone else seems to want them.
         */
 
        if (p & SR_IO) {
@@ -1797,49 +1584,49 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase
        } while ((tmp & BASR_PHASE_MATCH) && !(tmp & (BASR_BUSY_ERROR | BASR_END_DMA_TRANSFER)));
 
 /*
-   At this point, either we've completed DMA, or we have a phase mismatch,
-   or we've unexpectedly lost BUSY (which is a real error).
-
-   For write DMAs, we want to wait until the last byte has been
-   transferred out over the bus before we turn off DMA mode.  Alas, there
-   seems to be no terribly good way of doing this on a 5380 under all
-   conditions.  For non-scatter-gather operations, we can wait until REQ
-   and ACK both go false, or until a phase mismatch occurs.  Gather-writes
-   are nastier, since the device will be expecting more data than we
-   are prepared to send it, and REQ will remain asserted.  On a 53C8[01] we
-   could test LAST BIT SENT to assure transfer (I imagine this is precisely
-   why this signal was added to the newer chips) but on the older 538[01]
-   this signal does not exist.  The workaround for this lack is a watchdog;
-   we bail out of the wait-loop after a modest amount of wait-time if
-   the usual exit conditions are not met.  Not a terribly clean or
-   correct solution :-%
-
-   Reads are equally tricky due to a nasty characteristic of the NCR5380.
-   If the chip is in DMA mode for an READ, it will respond to a target's
-   REQ by latching the SCSI data into the INPUT DATA register and asserting
-   ACK, even if it has _already_ been notified by the DMA controller that
-   the current DMA transfer has completed!  If the NCR5380 is then taken
-   out of DMA mode, this already-acknowledged byte is lost.
-
-   This is not a problem for "one DMA transfer per command" reads, because
-   the situation will never arise... either all of the data is DMA'ed
-   properly, or the target switches to MESSAGE IN phase to signal a
-   disconnection (either operation bringing the DMA to a clean halt).
-   However, in order to handle scatter-reads, we must work around the
-   problem.  The chosen fix is to DMA N-2 bytes, then check for the
-   condition before taking the NCR5380 out of DMA mode.  One or two extra
-   bytes are transferred via PIO as necessary to fill out the original
-   request.
+ * At this point, either we've completed DMA, or we have a phase mismatch,
+ * or we've unexpectedly lost BUSY (which is a real error).
+ *
+ * For DMA sends, we want to wait until the last byte has been
+ * transferred out over the bus before we turn off DMA mode.  Alas, there
+ * seems to be no terribly good way of doing this on a 5380 under all
+ * conditions.  For non-scatter-gather operations, we can wait until REQ
+ * and ACK both go false, or until a phase mismatch occurs.  Gather-sends
+ * are nastier, since the device will be expecting more data than we
+ * are prepared to send it, and REQ will remain asserted.  On a 53C8[01] we
+ * could test Last Byte Sent to assure transfer (I imagine this is precisely
+ * why this signal was added to the newer chips) but on the older 538[01]
+ * this signal does not exist.  The workaround for this lack is a watchdog;
+ * we bail out of the wait-loop after a modest amount of wait-time if
+ * the usual exit conditions are not met.  Not a terribly clean or
+ * correct solution :-%
+ *
+ * DMA receive is equally tricky due to a nasty characteristic of the NCR5380.
+ * If the chip is in DMA receive mode, it will respond to a target's
+ * REQ by latching the SCSI data into the INPUT DATA register and asserting
+ * ACK, even if it has _already_ been notified by the DMA controller that
+ * the current DMA transfer has completed!  If the NCR5380 is then taken
+ * out of DMA mode, this already-acknowledged byte is lost. This is
+ * not a problem for "one DMA transfer per READ command", because
+ * the situation will never arise... either all of the data is DMA'ed
+ * properly, or the target switches to MESSAGE IN phase to signal a
+ * disconnection (either operation bringing the DMA to a clean halt).
+ * However, in order to handle scatter-receive, we must work around the
+ * problem.  The chosen fix is to DMA N-2 bytes, then check for the
+ * condition before taking the NCR5380 out of DMA mode.  One or two extra
+ * bytes are transferred via PIO as necessary to fill out the original
+ * request.
  */
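+
+/*
+ * In outline, the receive fixup just described (a sketch for the case where
+ * FLAG_NO_DMA_FIXUPS is clear):
+ *
+ *      start the DMA with N - 2 bytes and let it run to completion;
+ *      if BASR shows PHASE_MATCH and ACK, the chip latched byte N - 1:
+ *              saved_data = NCR5380_read(INPUT_DATA_REGISTER);
+ *      leave DMA mode, then PIO the final byte or two and append saved_data.
+ */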
 
        if (p & SR_IO) {
-#ifdef READ_OVERRUNS
-               udelay(10);
-               if (((NCR5380_read(BUS_AND_STATUS_REG) & (BASR_PHASE_MATCH | BASR_ACK)) == (BASR_PHASE_MATCH | BASR_ACK))) {
-                       saved_data = NCR5380_read(INPUT_DATA_REGISTER);
-                       overrun = 1;
+               if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS)) {
+                       udelay(10);
+                       if ((NCR5380_read(BUS_AND_STATUS_REG) & (BASR_PHASE_MATCH | BASR_ACK)) ==
+                           (BASR_PHASE_MATCH | BASR_ACK)) {
+                               saved_data = NCR5380_read(INPUT_DATA_REGISTER);
+                               overrun = 1;
+                       }
                }
-#endif
        } else {
                int limit = 100;
                while (((tmp = NCR5380_read(BUS_AND_STATUS_REG)) & BASR_ACK) || (NCR5380_read(STATUS_REG) & SR_REQ)) {
@@ -1850,7 +1637,8 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase
                }
        }
 
-       dprintk(NDEBUG_DMA, "scsi%d : polled DMA transfer complete, basr 0x%X, sr 0x%X\n", instance->host_no, tmp, NCR5380_read(STATUS_REG));
+       dsprintk(NDEBUG_DMA, instance, "polled DMA transfer complete, basr 0x%02x, sr 0x%02x\n",
+                tmp, NCR5380_read(STATUS_REG));
 
        NCR5380_write(MODE_REG, MR_BASE);
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
@@ -1861,8 +1649,8 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase
        *data += c;
        *phase = NCR5380_read(STATUS_REG) & PHASE_MASK;
 
-#ifdef READ_OVERRUNS
-       if (*phase == p && (p & SR_IO) && residue == 0) {
+       if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS) &&
+           *phase == p && (p & SR_IO) && residue == 0) {
                if (overrun) {
                        dprintk(NDEBUG_DMA, "Got an input overrun, using saved byte\n");
                        **data = saved_data;
@@ -1877,7 +1665,6 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase
                NCR5380_transfer_pio(instance, phase, &cnt, data);
                *count -= toPIO - cnt;
        }
-#endif
 
        dprintk(NDEBUG_DMA, "Return with data ptr = 0x%X, count %d, last 0x%X, next 0x%X\n", *data, *count, *(*data + *count - 1), *(*data + *count));
        return 0;
@@ -1886,95 +1673,64 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase
        return 0;
 #else                          /* defined(REAL_DMA_POLL) */
        if (p & SR_IO) {
-#ifdef DMA_WORKS_RIGHT
-               foo = NCR5380_pread(instance, d, c);
-#else
-               int diff = 1;
-               if (hostdata->flags & FLAG_NCR53C400) {
-                       diff = 0;
-               }
-               if (!(foo = NCR5380_pread(instance, d, c - diff))) {
+               foo = NCR5380_pread(instance, d,
+                       hostdata->flags & FLAG_NO_DMA_FIXUP ? c : c - 1);
+               if (!foo && !(hostdata->flags & FLAG_NO_DMA_FIXUP)) {
                        /*
-                        * We can't disable DMA mode after successfully transferring 
+                        * We can't disable DMA mode after successfully transferring
                         * what we plan to be the last byte, since that would open up
-                        * a race condition where if the target asserted REQ before 
+                        * a race condition where if the target asserted REQ before
                         * we got the DMA mode reset, the NCR5380 would have latched
                         * an additional byte into the INPUT DATA register and we'd
                         * have dropped it.
-                        * 
-                        * The workaround was to transfer one fewer bytes than we 
-                        * intended to with the pseudo-DMA read function, wait for 
+                        *
+                        * The workaround was to transfer one fewer byte than we
+                        * intended to with the pseudo-DMA read function, wait for
                         * the chip to latch the last byte, read it, and then disable
                         * pseudo-DMA mode.
-                        * 
+                        *
                         * After REQ is asserted, the NCR5380 asserts DRQ and ACK.
                         * REQ is deasserted when ACK is asserted, and not reasserted
                         * until ACK goes false.  Since the NCR5380 won't lower ACK
                         * until DACK is asserted, which won't happen unless we twiddle
-                        * the DMA port or we take the NCR5380 out of DMA mode, we 
-                        * can guarantee that we won't handshake another extra 
+                        * the DMA port or we take the NCR5380 out of DMA mode, we
+                        * can guarantee that we won't handshake another extra
                         * byte.
                         */
 
-                       if (!(hostdata->flags & FLAG_NCR53C400)) {
-                               while (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_DRQ));
-                               /* Wait for clean handshake */
-                               while (NCR5380_read(STATUS_REG) & SR_REQ);
-                               d[c - 1] = NCR5380_read(INPUT_DATA_REG);
+                       if (NCR5380_poll_politely(instance, BUS_AND_STATUS_REG,
+                                                 BASR_DRQ, BASR_DRQ, HZ) < 0) {
+                               foo = -1;
+                               shost_printk(KERN_ERR, instance, "PDMA read: DRQ timeout\n");
                        }
+                       if (NCR5380_poll_politely(instance, STATUS_REG,
+                                                 SR_REQ, 0, HZ) < 0) {
+                               foo = -1;
+                               shost_printk(KERN_ERR, instance, "PDMA read: !REQ timeout\n");
+                       }
+                       d[c - 1] = NCR5380_read(INPUT_DATA_REG);
                }
-#endif
        } else {
-#ifdef DMA_WORKS_RIGHT
                foo = NCR5380_pwrite(instance, d, c);
-#else
-               int timeout;
-               dprintk(NDEBUG_C400_PWRITE, "About to pwrite %d bytes\n", c);
-               if (!(foo = NCR5380_pwrite(instance, d, c))) {
+               if (!foo && !(hostdata->flags & FLAG_NO_DMA_FIXUP)) {
                        /*
-                        * Wait for the last byte to be sent.  If REQ is being asserted for 
-                        * the byte we're interested, we'll ACK it and it will go false.  
+                        * Wait for the last byte to be sent.  If REQ is being asserted for
+                        * the byte we're interested in, we'll ACK it and it will go false.
                         */
-                       if (!(hostdata->flags & FLAG_HAS_LAST_BYTE_SENT)) {
-                               timeout = 20000;
-                               while (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_DRQ) && (NCR5380_read(BUS_AND_STATUS_REG) & BASR_PHASE_MATCH));
-
-                               if (!timeout)
-                                       dprintk(NDEBUG_LAST_BYTE_SENT, "scsi%d : timed out on last byte\n", instance->host_no);
-
-                               if (hostdata->flags & FLAG_CHECK_LAST_BYTE_SENT) {
-                                       hostdata->flags &= ~FLAG_CHECK_LAST_BYTE_SENT;
-                                       if (NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT) {
-                                               hostdata->flags |= FLAG_HAS_LAST_BYTE_SENT;
-                                               dprintk(NDEBUG_LAST_BYTE_SENT, "scsi%d : last byte sent works\n", instance->host_no);
-                                       }
-                               }
-                       } else {
-                               dprintk(NDEBUG_C400_PWRITE, "Waiting for LASTBYTE\n");
-                               while (!(NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT));
-                               dprintk(NDEBUG_C400_PWRITE, "Got LASTBYTE\n");
+                       if (NCR5380_poll_politely2(instance,
+                            BUS_AND_STATUS_REG, BASR_DRQ, BASR_DRQ,
+                            BUS_AND_STATUS_REG, BASR_PHASE_MATCH, 0, HZ) < 0) {
+                               foo = -1;
+                               shost_printk(KERN_ERR, instance, "PDMA write: DRQ and phase timeout\n");
                        }
                }
-#endif
        }
        NCR5380_write(MODE_REG, MR_BASE);
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-
-       if ((!(p & SR_IO)) && (hostdata->flags & FLAG_NCR53C400)) {
-               dprintk(NDEBUG_C400_PWRITE, "53C400w: Checking for IRQ\n");
-               if (NCR5380_read(BUS_AND_STATUS_REG) & BASR_IRQ) {
-                       dprintk(NDEBUG_C400_PWRITE, "53C400w:    got it, reading reset interrupt reg\n");
-                       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-               } else {
-                       printk("53C400w:    IRQ NOT THERE!\n");
-               }
-       }
+       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
        *data = d + c;
        *count = 0;
        *phase = NCR5380_read(STATUS_REG) & PHASE_MASK;
-#if defined(PSEUDO_DMA) && defined(UNSAFE)
-       spin_lock_irq(instance->host_lock);
-#endif                         /* defined(REAL_DMA_POLL) */
        return foo;
 #endif                         /* def REAL_DMA */
 }
@@ -1983,25 +1739,23 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase
 /*
  * Function : NCR5380_information_transfer (struct Scsi_Host *instance)
  *
- * Purpose : run through the various SCSI phases and do as the target 
- *      directs us to.  Operates on the currently connected command, 
- *      instance->connected.
+ * Purpose : run through the various SCSI phases and do as the target
+ * directs us to.  Operates on the currently connected command,
+ * instance->connected.
  *
  * Inputs : instance, instance for which we are doing commands
  *
- * Side effects : SCSI things happen, the disconnected queue will be 
- *      modified if a command disconnects, *instance->connected will
- *      change.
+ * Side effects : SCSI things happen, the disconnected queue will be
+ * modified if a command disconnects, *instance->connected will
+ * change.
  *
- * XXX Note : we need to watch for bus free or a reset condition here 
- *      to recover from an unexpected bus free condition.
- *
- * Locks: io_request_lock held by caller in IRQ mode
+ * XXX Note : we need to watch for bus free or a reset condition here
+ * to recover from an unexpected bus free condition.
  */
 
-static void NCR5380_information_transfer(struct Scsi_Host *instance) {
-       NCR5380_local_declare();
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *)instance->hostdata;
+static void NCR5380_information_transfer(struct Scsi_Host *instance)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        unsigned char msgout = NOP;
        int sink = 0;
        int len;
@@ -2010,13 +1764,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
 #endif
        unsigned char *data;
        unsigned char phase, tmp, extended_msg[10], old_phase = 0xff;
-       struct scsi_cmnd *cmd = (struct scsi_cmnd *) hostdata->connected;
-       /* RvC: we need to set the end of the polling time */
-       unsigned long poll_time = jiffies + USLEEP_POLL;
+       struct scsi_cmnd *cmd;
 
-       NCR5380_setup(instance);
+       while ((cmd = hostdata->connected)) {
+               struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
 
-       while (1) {
                tmp = NCR5380_read(STATUS_REG);
                /* We only have a valid SCSI phase when REQ is asserted */
                if (tmp & SR_REQ) {
@@ -2028,24 +1780,28 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                        if (sink && (phase != PHASE_MSGOUT)) {
                                NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp));
 
-                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
-                               while (NCR5380_read(STATUS_REG) & SR_REQ);
-                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
+                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN |
+                                             ICR_ASSERT_ACK);
+                               while (NCR5380_read(STATUS_REG) & SR_REQ)
+                                       ;
+                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
+                                             ICR_ASSERT_ATN);
                                sink = 0;
                                continue;
                        }
+
                        switch (phase) {
-                       case PHASE_DATAIN:
                        case PHASE_DATAOUT:
 #if (NDEBUG & NDEBUG_NO_DATAOUT)
-                               printk("scsi%d : NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n", instance->host_no);
+                               shost_printk(KERN_DEBUG, instance, "NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n");
                                sink = 1;
                                do_abort(instance);
                                cmd->result = DID_ERROR << 16;
-                               cmd->scsi_done(cmd);
+                               complete_cmd(instance, cmd);
                                return;
 #endif
-                               /* 
+                       case PHASE_DATAIN:
+                               /*
                                 * If there is no room left in the current buffer in the
                                 * scatter-gather list, move onto the next one.
                                 */
@@ -2055,10 +1811,13 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                        --cmd->SCp.buffers_residual;
                                        cmd->SCp.this_residual = cmd->SCp.buffer->length;
                                        cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
-                                       dprintk(NDEBUG_INFORMATION, "scsi%d : %d bytes and %d buffers left\n", instance->host_no, cmd->SCp.this_residual, cmd->SCp.buffers_residual);
+                                       dsprintk(NDEBUG_INFORMATION, instance, "%d bytes and %d buffers left\n",
+                                                cmd->SCp.this_residual,
+                                                cmd->SCp.buffers_residual);
                                }
+
                                /*
-                                * The preferred transfer method is going to be 
+                                * The preferred transfer method is going to be
                                 * PSEUDO-DMA for systems that are strictly PIO,
                                 * since we can let the hardware do the handshaking.
                                 *
@@ -2068,50 +1827,39 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                 */
 
 #if defined(PSEUDO_DMA) || defined(REAL_DMA_POLL)
-                               /* KLL
-                                * PSEUDO_DMA is defined here. If this is the g_NCR5380
-                                * driver then it will always be defined, so the
-                                * FLAG_NO_PSEUDO_DMA is used to inhibit PDMA in the base
-                                * NCR5380 case.  I think this is a fairly clean solution.
-                                * We supplement these 2 if's with the flag.
-                                */
-#ifdef NCR5380_dma_xfer_len
-                               if (!cmd->device->borken && !(hostdata->flags & FLAG_NO_PSEUDO_DMA) && (transfersize = NCR5380_dma_xfer_len(instance, cmd)) != 0) {
-#else
-                               transfersize = cmd->transfersize;
-
-#ifdef LIMIT_TRANSFERSIZE      /* If we have problems with interrupt service */
-                               if (transfersize > 512)
-                                       transfersize = 512;
-#endif                         /* LIMIT_TRANSFERSIZE */
-
-                               if (!cmd->device->borken && transfersize && !(hostdata->flags & FLAG_NO_PSEUDO_DMA) && cmd->SCp.this_residual && !(cmd->SCp.this_residual % transfersize)) {
-                                       /* Limit transfers to 32K, for xx400 & xx406
-                                        * pseudoDMA that transfers in 128 bytes blocks. */
-                                       if (transfersize > 32 * 1024)
-                                               transfersize = 32 * 1024;
-#endif
+                               transfersize = 0;
+                               if (!cmd->device->borken &&
+                                   !(hostdata->flags & FLAG_NO_PSEUDO_DMA))
+                                       transfersize = NCR5380_dma_xfer_len(instance, cmd, phase);
+
+                               if (transfersize) {
                                        len = transfersize;
-                                       if (NCR5380_transfer_dma(instance, &phase, &len, (unsigned char **) &cmd->SCp.ptr)) {
+                                       if (NCR5380_transfer_dma(instance, &phase,
+                                           &len, (unsigned char **)&cmd->SCp.ptr)) {
                                                /*
-                                                * If the watchdog timer fires, all future accesses to this
-                                                * device will use the polled-IO.
+                                                * If the watchdog timer fires, all future
+                                                * accesses to this device will use the
+                                                * polled-IO.
                                                 */
                                                scmd_printk(KERN_INFO, cmd,
-                                                           "switching to slow handshake\n");
+                                                       "switching to slow handshake\n");
                                                cmd->device->borken = 1;
-                                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
                                                sink = 1;
                                                do_abort(instance);
                                                cmd->result = DID_ERROR << 16;
-                                               cmd->scsi_done(cmd);
+                                               complete_cmd(instance, cmd);
                                                /* XXX - need to source or sink data here, as appropriate */
                                        } else
                                                cmd->SCp.this_residual -= transfersize - len;
                                } else
 #endif                         /* defined(PSEUDO_DMA) || defined(REAL_DMA_POLL) */
-                                       NCR5380_transfer_pio(instance, &phase, (int *) &cmd->SCp.this_residual, (unsigned char **)
-                                                            &cmd->SCp.ptr);
+                               {
+                                       spin_unlock_irq(&hostdata->lock);
+                                       NCR5380_transfer_pio(instance, &phase,
+                                                            (int *)&cmd->SCp.this_residual,
+                                                            (unsigned char **)&cmd->SCp.ptr);
+                                       spin_lock_irq(&hostdata->lock);
+                               }
                                break;
                        case PHASE_MSGIN:
                                len = 1;
@@ -2120,101 +1868,42 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                cmd->SCp.Message = tmp;
 
                                switch (tmp) {
-                                       /*
-                                        * Linking lets us reduce the time required to get the 
-                                        * next command out to the device, hopefully this will
-                                        * mean we don't waste another revolution due to the delays
-                                        * required by ARBITRATION and another SELECTION.
-                                        *
-                                        * In the current implementation proposal, low level drivers
-                                        * merely have to start the next command, pointed to by 
-                                        * next_link, done() is called as with unlinked commands.
-                                        */
-#ifdef LINKED
-                               case LINKED_CMD_COMPLETE:
-                               case LINKED_FLG_CMD_COMPLETE:
-                                       /* Accept message by clearing ACK */
-                                       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                       dprintk(NDEBUG_LINKED, "scsi%d : target %d lun %llu linked command complete.\n", instance->host_no, cmd->device->id, cmd->device->lun);
-                                       /* 
-                                        * Sanity check : A linked command should only terminate with
-                                        * one of these messages if there are more linked commands
-                                        * available.
-                                        */
-                                       if (!cmd->next_link) {
-                                           printk("scsi%d : target %d lun %llu linked command complete, no next_link\n" instance->host_no, cmd->device->id, cmd->device->lun);
-                                               sink = 1;
-                                               do_abort(instance);
-                                               return;
-                                       }
-                                       initialize_SCp(cmd->next_link);
-                                       /* The next command is still part of this process */
-                                       cmd->next_link->tag = cmd->tag;
-                                       cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8);
-                                       dprintk(NDEBUG_LINKED, "scsi%d : target %d lun %llu linked request done, calling scsi_done().\n", instance->host_no, cmd->device->id, cmd->device->lun);
-                                       cmd->scsi_done(cmd);
-                                       cmd = hostdata->connected;
-                                       break;
-#endif                         /* def LINKED */
                                case ABORT:
                                case COMMAND_COMPLETE:
                                        /* Accept message by clearing ACK */
                                        sink = 1;
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                       hostdata->connected = NULL;
-                                       dprintk(NDEBUG_QUEUES, "scsi%d : command for target %d, lun %llu completed\n", instance->host_no, cmd->device->id, cmd->device->lun);
-                                       hostdata->busy[cmd->device->id] &= ~(1 << (cmd->device->lun & 0xFF));
-
-                                       /* 
-                                        * I'm not sure what the correct thing to do here is : 
-                                        * 
-                                        * If the command that just executed is NOT a request 
-                                        * sense, the obvious thing to do is to set the result
-                                        * code to the values of the stored parameters.
-                                        * 
-                                        * If it was a REQUEST SENSE command, we need some way 
-                                        * to differentiate between the failure code of the original
-                                        * and the failure code of the REQUEST sense - the obvious
-                                        * case is success, where we fall through and leave the result
-                                        * code unchanged.
-                                        * 
-                                        * The non-obvious place is where the REQUEST SENSE failed 
-                                        */
-
-                                       if (cmd->cmnd[0] != REQUEST_SENSE)
-                                               cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8);
-                                       else if (status_byte(cmd->SCp.Status) != GOOD)
-                                               cmd->result = (cmd->result & 0x00ffff) | (DID_ERROR << 16);
-
-                                       if ((cmd->cmnd[0] == REQUEST_SENSE) &&
-                                               hostdata->ses.cmd_len) {
-                                               scsi_eh_restore_cmnd(cmd, &hostdata->ses);
-                                               hostdata->ses.cmd_len = 0 ;
-                                       }
+                                       dsprintk(NDEBUG_QUEUES, instance,
+                                                "COMMAND COMPLETE %p target %d lun %llu\n",
+                                                cmd, scmd_id(cmd), cmd->device->lun);
 
-                                       if ((cmd->cmnd[0] != REQUEST_SENSE) && (status_byte(cmd->SCp.Status) == CHECK_CONDITION)) {
-                                               scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0);
-
-                                               dprintk(NDEBUG_AUTOSENSE, "scsi%d : performing request sense\n", instance->host_no);
+                                       hostdata->connected = NULL;
 
-                                               LIST(cmd, hostdata->issue_queue);
-                                               cmd->host_scribble = (unsigned char *)
-                                                   hostdata->issue_queue;
-                                               hostdata->issue_queue = (struct scsi_cmnd *) cmd;
-                                               dprintk(NDEBUG_QUEUES, "scsi%d : REQUEST SENSE added to head of issue queue\n", instance->host_no);
-                                       } else {
-                                               cmd->scsi_done(cmd);
+                                       cmd->result &= ~0xffff;
+                                       cmd->result |= cmd->SCp.Status;
+                                       cmd->result |= cmd->SCp.Message << 8;
+
+                                       if (cmd->cmnd[0] == REQUEST_SENSE)
+                                               complete_cmd(instance, cmd);
+                                       else {
+                                               if (cmd->SCp.Status == SAM_STAT_CHECK_CONDITION ||
+                                                   cmd->SCp.Status == SAM_STAT_COMMAND_TERMINATED) {
+                                                       dsprintk(NDEBUG_QUEUES, instance, "autosense: adding cmd %p to tail of autosense queue\n",
+                                                                cmd);
+                                                       list_add_tail(&ncmd->list,
+                                                                     &hostdata->autosense);
+                                               } else
+                                                       complete_cmd(instance, cmd);
                                        }
 
-                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-                                       /* 
-                                        * Restore phase bits to 0 so an interrupted selection, 
+                                       /*
+                                        * Restore phase bits to 0 so an interrupted selection,
                                         * arbitration can resume.
                                         */
                                        NCR5380_write(TARGET_COMMAND_REG, 0);
 
-                                       while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected)
-                                               barrier();
+                                       /* Enable reselect interrupts */
+                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
                                        return;
                                case MESSAGE_REJECT:
                                        /* Accept message by clearing ACK */
@@ -2229,38 +1918,33 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                        default:
                                                break;
                                        }
-                               case DISCONNECT:{
-                                               /* Accept message by clearing ACK */
-                                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                               cmd->device->disconnect = 1;
-                                               LIST(cmd, hostdata->disconnected_queue);
-                                               cmd->host_scribble = (unsigned char *)
-                                                   hostdata->disconnected_queue;
-                                               hostdata->connected = NULL;
-                                               hostdata->disconnected_queue = cmd;
-                                               dprintk(NDEBUG_QUEUES, "scsi%d : command for target %d lun %llu was moved from connected to" "  the disconnected_queue\n", instance->host_no, cmd->device->id, cmd->device->lun);
-                                               /* 
-                                                * Restore phase bits to 0 so an interrupted selection, 
-                                                * arbitration can resume.
-                                                */
-                                               NCR5380_write(TARGET_COMMAND_REG, 0);
-
-                                               /* Enable reselect interrupts */
-                                               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-                                               /* Wait for bus free to avoid nasty timeouts - FIXME timeout !*/
-                                               /* NCR538_poll_politely(instance, STATUS_REG, SR_BSY, 0, 30 * HZ); */
-                                               while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected)
-                                                       barrier();
-                                               return;
-                                       }
-                                       /* 
+                                       break;
+                               case DISCONNECT:
+                                       /* Accept message by clearing ACK */
+                                       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+                                       hostdata->connected = NULL;
+                                       list_add(&ncmd->list, &hostdata->disconnected);
+                                       dsprintk(NDEBUG_INFORMATION | NDEBUG_QUEUES,
+                                                instance, "connected command %p for target %d lun %llu moved to disconnected queue\n",
+                                                cmd, scmd_id(cmd), cmd->device->lun);
+
+                                       /*
+                                        * Restore phase bits to 0 so an interrupted selection,
+                                        * arbitration can resume.
+                                        */
+                                       NCR5380_write(TARGET_COMMAND_REG, 0);
+
+                                       /* Enable reselect interrupts */
+                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
+                                       return;
+                                       /*
                                         * The SCSI data pointer is *IMPLICITLY* saved on a disconnect
-                                        * operation, in violation of the SCSI spec so we can safely 
+                                        * operation, in violation of the SCSI spec so we can safely
                                         * ignore SAVE/RESTORE pointers calls.
                                         *
-                                        * Unfortunately, some disks violate the SCSI spec and 
+                                        * Unfortunately, some disks violate the SCSI spec and
                                         * don't issue the required SAVE_POINTERS message before
-                                        * disconnecting, and we have to break spec to remain 
+                                        * disconnecting, and we have to break spec to remain
                                         * compatible.
                                         */
                                case SAVE_POINTERS:
@@ -2269,31 +1953,28 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                                        break;
                                case EXTENDED_MESSAGE:
-/* 
- * Extended messages are sent in the following format :
- * Byte         
- * 0            EXTENDED_MESSAGE == 1
- * 1            length (includes one byte for code, doesn't 
- *              include first two bytes)
- * 2            code
- * 3..length+1  arguments
- *
- * Start the extended message buffer with the EXTENDED_MESSAGE
- * byte, since spi_print_msg() wants the whole thing.  
- */
+                                       /*
+                                        * Start the message buffer with the EXTENDED_MESSAGE
+                                        * byte, since spi_print_msg() wants the whole thing.
+                                        */
                                        extended_msg[0] = EXTENDED_MESSAGE;
                                        /* Accept first byte by clearing ACK */
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                       dprintk(NDEBUG_EXTENDED, "scsi%d : receiving extended message\n", instance->host_no);
+
+                                       spin_unlock_irq(&hostdata->lock);
+
+                                       dsprintk(NDEBUG_EXTENDED, instance, "receiving extended message\n");
 
                                        len = 2;
                                        data = extended_msg + 1;
                                        phase = PHASE_MSGIN;
                                        NCR5380_transfer_pio(instance, &phase, &len, &data);
+                                       dsprintk(NDEBUG_EXTENDED, instance, "length %d, code 0x%02x\n",
+                                                (int)extended_msg[1],
+                                                (int)extended_msg[2]);
 
-                                       dprintk(NDEBUG_EXTENDED, "scsi%d : length=%d, code=0x%02x\n", instance->host_no, (int) extended_msg[1], (int) extended_msg[2]);
-
-                                       if (!len && extended_msg[1] <= (sizeof(extended_msg) - 1)) {
+                                       if (!len && extended_msg[1] > 0 &&
+                                           extended_msg[1] <= sizeof(extended_msg) - 2) {
                                                /* Accept third byte by clearing ACK */
                                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                                                len = extended_msg[1] - 1;
@@ -2301,7 +1982,8 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                                phase = PHASE_MSGIN;
 
                                                NCR5380_transfer_pio(instance, &phase, &len, &data);
-                                               dprintk(NDEBUG_EXTENDED, "scsi%d : message received, residual %d\n", instance->host_no, len);
+                                               dsprintk(NDEBUG_EXTENDED, instance, "message received, residual %d\n",
+                                                        len);
 
                                                switch (extended_msg[2]) {
                                                case EXTENDED_SDTR:
@@ -2311,34 +1993,42 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                                        tmp = 0;
                                                }
                                        } else if (len) {
-                                               printk("scsi%d: error receiving extended message\n", instance->host_no);
+                                               shost_printk(KERN_ERR, instance, "error receiving extended message\n");
                                                tmp = 0;
                                        } else {
-                                               printk("scsi%d: extended message code %02x length %d is too long\n", instance->host_no, extended_msg[2], extended_msg[1]);
+                                               shost_printk(KERN_NOTICE, instance, "extended message code %02x length %d is too long\n",
+                                                            extended_msg[2], extended_msg[1]);
                                                tmp = 0;
                                        }
+
+                                       spin_lock_irq(&hostdata->lock);
+                                       if (!hostdata->connected)
+                                               return;
+
                                        /* Fall through to reject message */
 
-                                       /* 
-                                        * If we get something weird that we aren't expecting, 
+                                       /*
+                                        * If we get something weird that we aren't expecting,
                                         * reject it.
                                         */
                                default:
                                        if (!tmp) {
-                                               printk("scsi%d: rejecting message ", instance->host_no);
+                                               shost_printk(KERN_ERR, instance, "rejecting message ");
                                                spi_print_msg(extended_msg);
                                                printk("\n");
                                        } else if (tmp != EXTENDED_MESSAGE)
                                                scmd_printk(KERN_INFO, cmd,
-                                                       "rejecting unknown message %02x\n",tmp);
+                                                           "rejecting unknown message %02x\n",
+                                                           tmp);
                                        else
                                                scmd_printk(KERN_INFO, cmd,
-                                                       "rejecting unknown extended message code %02x, length %d\n", extended_msg[1], extended_msg[0]);
+                                                           "rejecting unknown extended message code %02x, length %d\n",
+                                                           extended_msg[2], extended_msg[1]);
 
                                        msgout = MESSAGE_REJECT;
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
                                        break;
-                               }       /* switch (tmp) */
+                               } /* switch (tmp) */
                                break;
                        case PHASE_MSGOUT:
                                len = 1;
@@ -2346,10 +2036,9 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                hostdata->last_message = msgout;
                                NCR5380_transfer_pio(instance, &phase, &len, &data);
                                if (msgout == ABORT) {
-                                       hostdata->busy[cmd->device->id] &= ~(1 << (cmd->device->lun & 0xFF));
                                        hostdata->connected = NULL;
                                        cmd->result = DID_ERROR << 16;
-                                       cmd->scsi_done(cmd);
+                                       complete_cmd(instance, cmd);
                                        NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
                                        return;
                                }
@@ -2358,17 +2047,12 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                        case PHASE_CMDOUT:
                                len = cmd->cmd_len;
                                data = cmd->cmnd;
-                               /* 
-                                * XXX for performance reasons, on machines with a 
-                                * PSEUDO-DMA architecture we should probably 
-                                * use the dma transfer function.  
+                               /*
+                                * XXX for performance reasons, on machines with a
+                                * PSEUDO-DMA architecture we should probably
+                                * use the dma transfer function.
                                 */
                                NCR5380_transfer_pio(instance, &phase, &len, &data);
-                               if (!cmd->device->disconnect && should_disconnect(cmd->cmnd[0])) {
-                                       NCR5380_set_timer(hostdata, USLEEP_SLEEP);
-                                       dprintk(NDEBUG_USLEEP, "scsi%d : issued command, sleeping until %lu\n", instance->host_no, hostdata->time_expires);
-                                       return;
-                               }
                                break;
                        case PHASE_STATIN:
                                len = 1;
@@ -2377,46 +2061,37 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) {
                                cmd->SCp.Status = tmp;
                                break;
                        default:
-                               printk("scsi%d : unknown phase\n", instance->host_no);
+                               shost_printk(KERN_ERR, instance, "unknown phase\n");
                                NCR5380_dprint(NDEBUG_ANY, instance);
-                       }       /* switch(phase) */
-               }               /* if (tmp * SR_REQ) */
-               else {
-                       /* RvC: go to sleep if polling time expired
-                        */
-                       if (!cmd->device->disconnect && time_after_eq(jiffies, poll_time)) {
-                               NCR5380_set_timer(hostdata, USLEEP_SLEEP);
-                               dprintk(NDEBUG_USLEEP, "scsi%d : poll timed out, sleeping until %lu\n", instance->host_no, hostdata->time_expires);
-                               return;
-                       }
+                       } /* switch(phase) */
+               } else {
+                       spin_unlock_irq(&hostdata->lock);
+                       NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ);
+                       spin_lock_irq(&hostdata->lock);
                }
-       }                       /* while (1) */
+       }
 }
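
A note on the locking pattern visible in the hunks above: the rewritten core drops hostdata->lock around anything that can wait (PIO transfers, NCR5380_poll_politely()) so that the interrupt handler and error handling can run in the meantime, then re-checks shared state after re-acquiring the lock. A minimal sketch of the pattern, assuming the driver context above and that the caller holds the lock on entry:

	/* Sketch only: mirrors the unlock/poll/relock sequence above. */
	static void example_wait_for_req(struct Scsi_Host *instance)
	{
		struct NCR5380_hostdata *hostdata = shost_priv(instance);

		/* Drop the lock so the IRQ handler and EH can run while we
		 * wait up to one second for the target to assert REQ. */
		spin_unlock_irq(&hostdata->lock);
		NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ);
		spin_lock_irq(&hostdata->lock);

		/* Shared state may have changed while unlocked; callers must
		 * re-check, e.g. the extended-message path above bails out
		 * when hostdata->connected went NULL. */
	}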
 
 /*
  * Function : void NCR5380_reselect (struct Scsi_Host *instance)
  *
- * Purpose : does reselection, initializing the instance->connected 
- *      field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q
- *      nexus has been reestablished,
- *      
- * Inputs : instance - this instance of the NCR5380.
+ * Purpose : does reselection, initializing the instance->connected
+ * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q
+ * nexus has been reestablished,
  *
- * Locks: io_request_lock held by caller if IRQ driven
+ * Inputs : instance - this instance of the NCR5380.
  */
 
-static void NCR5380_reselect(struct Scsi_Host *instance) {
-       NCR5380_local_declare();
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *)
-        instance->hostdata;
+static void NCR5380_reselect(struct Scsi_Host *instance)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        unsigned char target_mask;
        unsigned char lun, phase;
        int len;
        unsigned char msg[3];
        unsigned char *data;
-       struct scsi_cmnd *tmp = NULL, *prev;
-       int abort = 0;
-       NCR5380_setup(instance);
+       struct NCR5380_cmd *ncmd;
+       struct scsi_cmnd *tmp;
 
        /*
         * Disable arbitration, etc. since the host adapter obviously
@@ -2424,12 +2099,12 @@ static void NCR5380_reselect(struct Scsi_Host *instance) {
         */
 
        NCR5380_write(MODE_REG, MR_BASE);
-       hostdata->restart_select = 1;
 
        target_mask = NCR5380_read(CURRENT_SCSI_DATA_REG) & ~(hostdata->id_mask);
-       dprintk(NDEBUG_SELECTION, "scsi%d : reselect\n", instance->host_no);
 
-       /* 
+       dsprintk(NDEBUG_RESELECTION, instance, "reselect\n");
+
+       /*
         * At this point, we have detected that our SCSI ID is on the bus,
         * SEL is true and BSY was false for at least one bus settle delay
         * (400 ns).
@@ -2439,103 +2114,110 @@ static void NCR5380_reselect(struct Scsi_Host *instance) {
         */
 
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY);
-
-       /* FIXME: timeout too long, must fail to workqueue */   
-       if(NCR5380_poll_politely(instance, STATUS_REG, SR_SEL, 0, 2*HZ)<0)
-               abort = 1;
-               
+       if (NCR5380_poll_politely(instance,
+                                 STATUS_REG, SR_SEL, 0, 2 * HZ) < 0) {
+               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+               return;
+       }
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
 
        /*
         * Wait for target to go into MSGIN.
-        * FIXME: timeout needed and fail to work queeu
         */
 
-       if(NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 2*HZ))
-               abort = 1;
+       if (NCR5380_poll_politely(instance,
+                                 STATUS_REG, SR_REQ, SR_REQ, 2 * HZ) < 0) {
+               do_abort(instance);
+               return;
+       }
 
        len = 1;
        data = msg;
        phase = PHASE_MSGIN;
        NCR5380_transfer_pio(instance, &phase, &len, &data);
 
+       if (len) {
+               do_abort(instance);
+               return;
+       }
+
        if (!(msg[0] & 0x80)) {
-               printk(KERN_ERR "scsi%d : expecting IDENTIFY message, got ", instance->host_no);
+               shost_printk(KERN_ERR, instance, "expecting IDENTIFY message, got ");
                spi_print_msg(msg);
-               abort = 1;
-       } else {
-               /* Accept message by clearing ACK */
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-               lun = (msg[0] & 0x07);
+               printk("\n");
+               do_abort(instance);
+               return;
+       }
+       lun = msg[0] & 0x07;
 
-               /* 
-                * We need to add code for SCSI-II to track which devices have
-                * I_T_L_Q nexuses established, and which have simple I_T_L
-                * nexuses so we can chose to do additional data transfer.
-                */
+       /*
+        * We need to add code for SCSI-II to track which devices have
+        * I_T_L_Q nexuses established, and which have simple I_T_L
+        * nexuses so we can choose to do additional data transfer.
+        */
 
-               /* 
-                * Find the command corresponding to the I_T_L or I_T_L_Q  nexus we 
-                * just reestablished, and remove it from the disconnected queue.
-                */
+       /*
+        * Find the command corresponding to the I_T_L or I_T_L_Q nexus we
+        * just reestablished, and remove it from the disconnected queue.
+        */
 
+       tmp = NULL;
+       list_for_each_entry(ncmd, &hostdata->disconnected, list) {
+               struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);
 
-               for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue, prev = NULL; tmp; prev = tmp, tmp = (struct scsi_cmnd *) tmp->host_scribble)
-                       if ((target_mask == (1 << tmp->device->id)) && (lun == (u8)tmp->device->lun)
-                           ) {
-                               if (prev) {
-                                       REMOVE(prev, prev->host_scribble, tmp, tmp->host_scribble);
-                                       prev->host_scribble = tmp->host_scribble;
-                               } else {
-                                       REMOVE(-1, hostdata->disconnected_queue, tmp, tmp->host_scribble);
-                                       hostdata->disconnected_queue = (struct scsi_cmnd *) tmp->host_scribble;
-                               }
-                               tmp->host_scribble = NULL;
-                               break;
-                       }
-               if (!tmp) {
-                       printk(KERN_ERR "scsi%d : warning : target bitmask %02x lun %d not in disconnect_queue.\n", instance->host_no, target_mask, lun);
-                       /* 
-                        * Since we have an established nexus that we can't do anything with,
-                        * we must abort it.  
-                        */
-                       abort = 1;
+               if (target_mask == (1 << scmd_id(cmd)) &&
+                   lun == (u8)cmd->device->lun) {
+                       list_del(&ncmd->list);
+                       tmp = cmd;
+                       break;
                }
        }
 
-       if (abort) {
-               do_abort(instance);
+       if (tmp) {
+               dsprintk(NDEBUG_RESELECTION | NDEBUG_QUEUES, instance,
+                        "reselect: removed %p from disconnected queue\n", tmp);
        } else {
-               hostdata->connected = tmp;
-               dprintk(NDEBUG_RESELECTION, "scsi%d : nexus established, target = %d, lun = %llu, tag = %d\n", instance->host_no, tmp->device->id, tmp->device->lun, tmp->tag);
+               shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d not in disconnected queue.\n",
+                            target_mask, lun);
+               /*
+                * Since we have an established nexus that we can't do anything
+                * with, we must abort it.
+                */
+               do_abort(instance);
+               return;
        }
+
+       /* Accept message by clearing ACK */
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+
+       hostdata->connected = tmp;
+       dsprintk(NDEBUG_RESELECTION, instance, "nexus established, target %d, lun %llu, tag %d\n",
+                scmd_id(tmp), tmp->device->lun, tmp->tag);
 }
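
For reference, the reselection path above depends on the SCSI IDENTIFY message layout: bit 7 marks the message as IDENTIFY and the low three bits carry the LUN. A hedged sketch of the decode (example_decode_identify() is a hypothetical helper for illustration, not part of the driver):

	/* Hypothetical helper showing the checks made in NCR5380_reselect(). */
	static bool example_decode_identify(unsigned char msg, unsigned char *lun)
	{
		if (!(msg & 0x80))	/* bit 7 must be set for IDENTIFY */
			return false;	/* reselection is aborted in this case */
		*lun = msg & 0x07;	/* low three bits select the LUN */
		return true;
	}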
 
 /*
  * Function : void NCR5380_dma_complete (struct Scsi_Host *instance)
  *
  * Purpose : called by interrupt handler when DMA finishes or a phase
- *      mismatch occurs (which would finish the DMA transfer).  
+ * mismatch occurs (which would finish the DMA transfer).
  *
  * Inputs : instance - this instance of the NCR5380.
  *
  * Returns : pointer to the scsi_cmnd structure for which the I_T_L
- *      nexus has been reestablished, on failure NULL is returned.
+ * nexus has been reestablished, on failure NULL is returned.
  */
 
 #ifdef REAL_DMA
 static void NCR5380_dma_complete(NCR5380_instance * instance) {
-       NCR5380_local_declare();
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int transferred;
-       NCR5380_setup(instance);
 
        /*
         * XXX this might not be right.
         *
         * Wait for final byte to transfer, ie wait for ACK to go false.
         *
-        * We should use the Last Byte Sent bit, unfortunately this is 
+        * We should use the Last Byte Sent bit, unfortunately this is
         * not available on the 5380/5381 (only the various CMOS chips)
         *
         * FIXME: timeout, and need to handle long timeout/irq case
@@ -2543,7 +2225,6 @@ static void NCR5380_dma_complete(NCR5380_instance * instance) {
 
        NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, BASR_ACK, 0, 5*HZ);
 
-       NCR5380_write(MODE_REG, MR_BASE);
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
 
        /*
@@ -2560,190 +2241,251 @@ static void NCR5380_dma_complete(NCR5380_instance * instance) {
 }
 #endif                         /* def REAL_DMA */
 
-/*
- * Function : int NCR5380_abort (struct scsi_cmnd *cmd)
- *
- * Purpose : abort a command
- *
- * Inputs : cmd - the scsi_cmnd to abort, code - code to set the
- *      host byte of the result field to, if zero DID_ABORTED is
- *      used.
- *
- * Returns : SUCCESS - success, FAILED on failure.
- *
- *     XXX - there is no way to abort the command that is currently
- *     connected, you have to wait for it to complete.  If this is
- *     a problem, we could implement longjmp() / setjmp(), setjmp()
- *     called where the loop started in NCR5380_main().
- *
- * Locks: host lock taken by caller
+/**
+ * list_find_cmd - test for presence of a command in a linked list
+ * @haystack: list of commands
+ * @needle: command to search for
  */
 
-static int NCR5380_abort(struct scsi_cmnd *cmd)
+static bool list_find_cmd(struct list_head *haystack,
+                          struct scsi_cmnd *needle)
 {
-       NCR5380_local_declare();
-       struct Scsi_Host *instance = cmd->device->host;
-       struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata;
-       struct scsi_cmnd *tmp, **prev;
+       struct NCR5380_cmd *ncmd;
 
-       scmd_printk(KERN_WARNING, cmd, "aborting command\n");
+       list_for_each_entry(ncmd, haystack, list)
+               if (NCR5380_to_scmd(ncmd) == needle)
+                       return true;
+       return false;
+}
 
-       NCR5380_print_status(instance);
+/**
+ * list_del_cmd - remove a command from a linked list
+ * @haystack: list of commands
+ * @needle: command to remove
+ */
 
-       NCR5380_setup(instance);
+static bool list_del_cmd(struct list_head *haystack,
+                         struct scsi_cmnd *needle)
+{
+       if (list_find_cmd(haystack, needle)) {
+               struct NCR5380_cmd *ncmd = scsi_cmd_priv(needle);
 
-       dprintk(NDEBUG_ABORT, "scsi%d : abort called\n", instance->host_no);
-       dprintk(NDEBUG_ABORT, "        basr 0x%X, sr 0x%X\n", NCR5380_read(BUS_AND_STATUS_REG), NCR5380_read(STATUS_REG));
+               list_del(&ncmd->list);
+               return true;
+       }
+       return false;
+}
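
A note on the two helpers just added: list_del_cmd() tests for membership before unlinking because, by the time error handling runs, the command may sit on a different queue or on none at all, and the return value tells the caller which case it hit. A small usage sketch (example_queue is hypothetical; scsi_cmd_priv() is the mid-layer accessor also used below):

	LIST_HEAD(example_queue);	/* hypothetical queue for illustration */
	struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);

	list_add(&ncmd->list, &example_queue);
	WARN_ON(!list_del_cmd(&example_queue, cmd));	/* found and unlinked */
	WARN_ON(list_find_cmd(&example_queue, cmd));	/* no longer present */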
 
-#if 0
-/*
- * Case 1 : If the command is the currently executing command, 
- * we'll set the aborted flag and return control so that 
- * information transfer routine can exit cleanly.
+/**
+ * NCR5380_abort - scsi host eh_abort_handler() method
+ * @cmd: the command to be aborted
+ *
+ * Try to abort a given command by removing it from queues and/or sending
+ * the target an abort message. This may not succeed in causing a target
+ * to abort the command. Nonetheless, the low-level driver must forget about
+ * the command because the mid-layer reclaims it and it may be re-issued.
+ *
+ * The normal path taken by a command is as follows. For EH we trace this
+ * same path to locate and abort the command.
+ *
+ * unissued -> selecting -> [unissued -> selecting ->]... connected ->
+ * [disconnected -> connected ->]...
+ * [autosense -> connected ->] done
+ *
+ * If cmd is unissued then just remove it.
+ * If cmd is disconnected, try to select the target.
+ * If cmd is connected, try to send an abort message.
+ * If cmd is waiting for autosense, give it a chance to complete but check
+ * that it isn't left connected.
+ * If cmd was not found at all then presumably it has already been completed,
+ * in which case return SUCCESS to try to avoid further EH measures.
+ * If the command has not completed yet, we must not fail to find it.
  */
 
-       if (hostdata->connected == cmd) {
-               dprintk(NDEBUG_ABORT, "scsi%d : aborting connected command\n", instance->host_no);
-               hostdata->aborted = 1;
-/*
- * We should perform BSY checking, and make sure we haven't slipped
- * into BUS FREE.
- */
+static int NCR5380_abort(struct scsi_cmnd *cmd)
+{
+       struct Scsi_Host *instance = cmd->device->host;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       unsigned long flags;
+       int result = SUCCESS;
 
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_ATN);
-/* 
- * Since we can't change phases until we've completed the current 
- * handshake, we have to source or sink a byte of data if the current
- * phase is not MSGOUT.
- */
+       spin_lock_irqsave(&hostdata->lock, flags);
 
-/* 
- * Return control to the executing NCR drive so we can clear the
- * aborted flag and get back into our main loop.
- */
+#if (NDEBUG & NDEBUG_ANY)
+       scmd_printk(KERN_INFO, cmd, "%s\n", __func__);
+#endif
+       NCR5380_dprint(NDEBUG_ANY, instance);
+       NCR5380_dprint_phase(NDEBUG_ANY, instance);
 
-               return SUCCESS;
+       if (list_del_cmd(&hostdata->unissued, cmd)) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: removed %p from issue queue\n", cmd);
+               cmd->result = DID_ABORT << 16;
+               cmd->scsi_done(cmd); /* No tag or busy flag to worry about */
        }
-#endif
 
-/* 
- * Case 2 : If the command hasn't been issued yet, we simply remove it 
- *          from the issue queue.
- */
-       dprintk(NDEBUG_ABORT, "scsi%d : abort going into loop.\n", instance->host_no);
-       for (prev = (struct scsi_cmnd **) &(hostdata->issue_queue), tmp = (struct scsi_cmnd *) hostdata->issue_queue; tmp; prev = (struct scsi_cmnd **) &(tmp->host_scribble), tmp = (struct scsi_cmnd *) tmp->host_scribble)
-               if (cmd == tmp) {
-                       REMOVE(5, *prev, tmp, tmp->host_scribble);
-                       (*prev) = (struct scsi_cmnd *) tmp->host_scribble;
-                       tmp->host_scribble = NULL;
-                       tmp->result = DID_ABORT << 16;
-                       dprintk(NDEBUG_ABORT, "scsi%d : abort removed command from issue queue.\n", instance->host_no);
-                       tmp->scsi_done(tmp);
-                       return SUCCESS;
+       if (hostdata->selecting == cmd) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: cmd %p == selecting\n", cmd);
+               hostdata->selecting = NULL;
+               cmd->result = DID_ABORT << 16;
+               complete_cmd(instance, cmd);
+               goto out;
+       }
+
+       if (list_del_cmd(&hostdata->disconnected, cmd)) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: removed %p from disconnected list\n", cmd);
+               cmd->result = DID_ERROR << 16;
+               if (!hostdata->connected)
+                       NCR5380_select(instance, cmd);
+               if (hostdata->connected != cmd) {
+                       complete_cmd(instance, cmd);
+                       result = FAILED;
+                       goto out;
+               }
+       }
+
+       if (hostdata->connected == cmd) {
+               dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd);
+               hostdata->connected = NULL;
+               if (do_abort(instance)) {
+                       set_host_byte(cmd, DID_ERROR);
+                       complete_cmd(instance, cmd);
+                       result = FAILED;
+                       goto out;
                }
-#if (NDEBUG  & NDEBUG_ABORT)
-       /* KLL */
-               else if (prev == tmp)
-                       printk(KERN_ERR "scsi%d : LOOP\n", instance->host_no);
+               set_host_byte(cmd, DID_ABORT);
+#ifdef REAL_DMA
+               hostdata->dma_len = 0;
 #endif
+               if (cmd->cmnd[0] == REQUEST_SENSE)
+                       complete_cmd(instance, cmd);
+               else {
+                       struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
 
-/* 
- * Case 3 : If any commands are connected, we're going to fail the abort
- *          and let the high level SCSI driver retry at a later time or 
- *          issue a reset.
- *
- *          Timeouts, and therefore aborted commands, will be highly unlikely
- *          and handling them cleanly in this situation would make the common
- *          case of noresets less efficient, and would pollute our code.  So,
- *          we fail.
- */
+                       /* Perform autosense for this command */
+                       list_add(&ncmd->list, &hostdata->autosense);
+               }
+       }
 
-       if (hostdata->connected) {
-               dprintk(NDEBUG_ABORT, "scsi%d : abort failed, command connected.\n", instance->host_no);
-               return FAILED;
+       if (list_find_cmd(&hostdata->autosense, cmd)) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: found %p on sense queue\n", cmd);
+               spin_unlock_irqrestore(&hostdata->lock, flags);
+               queue_work(hostdata->work_q, &hostdata->main_task);
+               msleep(1000);
+               spin_lock_irqsave(&hostdata->lock, flags);
+               if (list_del_cmd(&hostdata->autosense, cmd)) {
+                       dsprintk(NDEBUG_ABORT, instance,
+                                "abort: removed %p from sense queue\n", cmd);
+                       set_host_byte(cmd, DID_ABORT);
+                       complete_cmd(instance, cmd);
+                       goto out;
+               }
        }
-/*
- * Case 4: If the command is currently disconnected from the bus, and 
- *      there are no connected commands, we reconnect the I_T_L or 
- *      I_T_L_Q nexus associated with it, go into message out, and send 
- *      an abort message.
- *
- * This case is especially ugly. In order to reestablish the nexus, we
- * need to call NCR5380_select().  The easiest way to implement this 
- * function was to abort if the bus was busy, and let the interrupt
- * handler triggered on the SEL for reselect take care of lost arbitrations
- * where necessary, meaning interrupts need to be enabled.
- *
- * When interrupts are enabled, the queues may change - so we 
- * can't remove it from the disconnected queue before selecting it
- * because that could cause a failure in hashing the nexus if that 
- * device reselected.
- * 
- * Since the queues may change, we can't use the pointers from when we
- * first locate it.
- *
- * So, we must first locate the command, and if NCR5380_select()
- * succeeds, then issue the abort, relocate the command and remove
- * it from the disconnected queue.
- */
 
-       for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; tmp = (struct scsi_cmnd *) tmp->host_scribble)
-               if (cmd == tmp) {
-                       dprintk(NDEBUG_ABORT, "scsi%d : aborting disconnected command.\n", instance->host_no);
+       if (hostdata->connected == cmd) {
+               dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd);
+               hostdata->connected = NULL;
+               if (do_abort(instance)) {
+                       set_host_byte(cmd, DID_ERROR);
+                       complete_cmd(instance, cmd);
+                       result = FAILED;
+                       goto out;
+               }
+               set_host_byte(cmd, DID_ABORT);
+#ifdef REAL_DMA
+               hostdata->dma_len = 0;
+#endif
+               complete_cmd(instance, cmd);
+       }
 
-                       if (NCR5380_select(instance, cmd))
-                               return FAILED;
-                       dprintk(NDEBUG_ABORT, "scsi%d : nexus reestablished.\n", instance->host_no);
+out:
+       if (result == FAILED)
+               dsprintk(NDEBUG_ABORT, instance, "abort: failed to abort %p\n", cmd);
+       else
+               dsprintk(NDEBUG_ABORT, instance, "abort: successfully aborted %p\n", cmd);
 
-                       do_abort(instance);
+       queue_work(hostdata->work_q, &hostdata->main_task);
+       spin_unlock_irqrestore(&hostdata->lock, flags);
 
-                       for (prev = (struct scsi_cmnd **) &(hostdata->disconnected_queue), tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; prev = (struct scsi_cmnd **) &(tmp->host_scribble), tmp = (struct scsi_cmnd *) tmp->host_scribble)
-                               if (cmd == tmp) {
-                                       REMOVE(5, *prev, tmp, tmp->host_scribble);
-                                       *prev = (struct scsi_cmnd *) tmp->host_scribble;
-                                       tmp->host_scribble = NULL;
-                                       tmp->result = DID_ABORT << 16;
-                                       tmp->scsi_done(tmp);
-                                       return SUCCESS;
-                               }
-               }
-/*
- * Case 5 : If we reached this point, the command was not found in any of 
- *          the queues.
- *
- * We probably reached this point because of an unlikely race condition
- * between the command completing successfully and the abortion code,
- * so we won't panic, but we will notify the user in case something really
- * broke.
- */
-       printk(KERN_WARNING "scsi%d : warning : SCSI command probably completed successfully\n"
-                       "         before abortion\n", instance->host_no);
-       return FAILED;
+       return result;
 }
 
 
-/* 
- * Function : int NCR5380_bus_reset (struct scsi_cmnd *cmd)
- * 
- * Purpose : reset the SCSI bus.
- *
- * Returns : SUCCESS
+/**
+ * NCR5380_bus_reset - reset the SCSI bus
+ * @cmd: SCSI command undergoing EH
  *
- * Locks: host lock taken by caller
+ * Returns SUCCESS
  */
 
 static int NCR5380_bus_reset(struct scsi_cmnd *cmd)
 {
        struct Scsi_Host *instance = cmd->device->host;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       int i;
+       unsigned long flags;
+       struct NCR5380_cmd *ncmd;
 
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
-       NCR5380_print_status(instance);
+       spin_lock_irqsave(&hostdata->lock, flags);
+
+#if (NDEBUG & NDEBUG_ANY)
+       scmd_printk(KERN_INFO, cmd, "%s\n", __func__);
+#endif
+       NCR5380_dprint(NDEBUG_ANY, instance);
+       NCR5380_dprint_phase(NDEBUG_ANY, instance);
 
-       spin_lock_irq(instance->host_lock);
        do_reset(instance);
-       spin_unlock_irq(instance->host_lock);
+
+       /* reset NCR registers */
+       NCR5380_write(MODE_REG, MR_BASE);
+       NCR5380_write(TARGET_COMMAND_REG, 0);
+       NCR5380_write(SELECT_ENABLE_REG, 0);
+
+       /* After the reset, there are no more connected or disconnected commands
+        * and no busy units; so clear the low-level status here to avoid
+        * conflicts when the mid-level code tries to wake up the affected
+        * commands!
+        */
+
+       hostdata->selecting = NULL;
+
+       list_for_each_entry(ncmd, &hostdata->disconnected, list) {
+               struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);
+
+               set_host_byte(cmd, DID_RESET);
+               cmd->scsi_done(cmd);
+       }
+
+       list_for_each_entry(ncmd, &hostdata->autosense, list) {
+               struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);
+
+               set_host_byte(cmd, DID_RESET);
+               cmd->scsi_done(cmd);
+       }
+
+       if (hostdata->connected) {
+               set_host_byte(hostdata->connected, DID_RESET);
+               complete_cmd(instance, hostdata->connected);
+               hostdata->connected = NULL;
+       }
+
+       if (hostdata->sensing) {
+               set_host_byte(hostdata->sensing, DID_RESET);
+               complete_cmd(instance, hostdata->sensing);
+               hostdata->sensing = NULL;
+       }
+
+       for (i = 0; i < 8; ++i)
+               hostdata->busy[i] = 0;
+#ifdef REAL_DMA
+       hostdata->dma_len = 0;
+#endif
+
+       queue_work(hostdata->work_q, &hostdata->main_task);
+       spin_unlock_irqrestore(&hostdata->lock, flags);
 
        return SUCCESS;
 }
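
Both EH methods are wired up through each board driver's host template. A hedged sketch of the relevant fields, patterned on the cumanascsi/oakscsi templates later in this diff (the template name and surrounding values are illustrative):

	static struct scsi_host_template example_template = {
		.module			= THIS_MODULE,
		.name			= "NCR5380 example",
		.queuecommand		= NCR5380_queue_command,
		.eh_abort_handler	= NCR5380_abort,
		.eh_bus_reset_handler	= NCR5380_bus_reset,
		.this_id		= 7,
		.cmd_per_lun		= 2,
		.use_clustering		= DISABLE_CLUSTERING,
		.cmd_size		= NCR5380_CMD_SIZE,	/* per-command driver data */
	};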
index 162112dd1bf8eabeaf0c8bb029291742d6c55ce4..a79288682a74ad91aad59b3ec8f64242337e3713 100644 (file)
 #ifndef NCR5380_H
 #define NCR5380_H
 
+#include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <scsi/scsi_dbg.h>
 #include <scsi/scsi_eh.h>
+#include <scsi/scsi_transport_spi.h>
 
 #define NDEBUG_ARBITRATION     0x1
 #define NDEBUG_AUTOSENSE       0x2
 /* Write any value to this register to start an ini mode DMA receive */
 #define START_DMA_INITIATOR_RECEIVE_REG 7      /* wo */
 
-#define C400_CONTROL_STATUS_REG NCR53C400_register_offset-8    /* rw */
-
+/* NCR 53C400(A) Control Status Register bits: */
 #define CSR_RESET              0x80    /* wo  Resets 53c400 */
 #define CSR_53C80_REG          0x80    /* ro  5380 registers busy */
 #define CSR_TRANS_DIR          0x40    /* rw  Data transfer direction */
 #define CSR_BASE CSR_53C80_INTR
 #endif
 
-/* Number of 128-byte blocks to be transferred */
-#define C400_BLOCK_COUNTER_REG   NCR53C400_register_offset-7   /* rw */
-
-/* Resume transfer after disconnect */
-#define C400_RESUME_TRANSFER_REG NCR53C400_register_offset-6   /* wo */
-
-/* Access to host buffer stack */
-#define C400_HOST_BUFFER         NCR53C400_register_offset-4   /* rw */
-
-
 /* Note : PHASE_* macros are based on the values of the STATUS register */
 #define PHASE_MASK     (SR_MSG | SR_CD | SR_IO)
 
 
 #define PHASE_SR_TO_TCR(phase) ((phase) >> 2)
 
-/*
- * The internal should_disconnect() function returns these based on the 
- * expected length of a disconnect if a device supports disconnect/
- * reconnect.
- */
-
-#define DISCONNECT_NONE                0
-#define DISCONNECT_TIME_TO_DATA        1
-#define DISCONNECT_LONG                2
-
 /* 
  * "Special" value for the (unsigned char) command tag, to indicate
  * I_T_L nexus instead of I_T_L_Q.
 #define NO_IRQ         0
 #endif
 
-#define FLAG_HAS_LAST_BYTE_SENT                1       /* NCR53c81 or better */
-#define FLAG_CHECK_LAST_BYTE_SENT      2       /* Only test once */
-#define FLAG_NCR53C400                 4       /* NCR53c400 */
+#define FLAG_NO_DMA_FIXUP              1       /* No DMA errata workarounds */
 #define FLAG_NO_PSEUDO_DMA             8       /* Inhibit DMA */
-#define FLAG_DTC3181E                  16      /* DTC3181E */
 #define FLAG_LATE_DMA_SETUP            32      /* Setup NCR before DMA H/W */
 #define FLAG_TAGGED_QUEUING            64      /* as X3T9.2 spelled it */
-
-#ifndef ASM
+#define FLAG_TOSHIBA_DELAY             128     /* Allow for broken CD-ROMs */
 
 #ifdef SUPPORT_TAGS
 struct tag_alloc {
@@ -258,33 +238,24 @@ struct NCR5380_hostdata {
        NCR5380_implementation_fields;          /* implementation specific */
        struct Scsi_Host *host;                 /* Host backpointer */
        unsigned char id_mask, id_higher_mask;  /* 1 << id, all bits greater */
-       unsigned char targets_present;          /* targets we have connected
-                                                  to, so we can call a select
-                                                  failure a retryable condition */
-       volatile unsigned char busy[8];         /* index = target, bit = lun */
+       unsigned char busy[8];                  /* index = target, bit = lun */
 #if defined(REAL_DMA) || defined(REAL_DMA_POLL)
-       volatile int dma_len;                   /* requested length of DMA */
+       int dma_len;                            /* requested length of DMA */
 #endif
-       volatile unsigned char last_message;    /* last message OUT */
-       volatile struct scsi_cmnd *connected;   /* currently connected command */
-       volatile struct scsi_cmnd *issue_queue; /* waiting to be issued */
-       volatile struct scsi_cmnd *disconnected_queue;  /* waiting for reconnect */
-       volatile int restart_select;            /* we have disconnected,
-                                                  used to restart 
-                                                  NCR5380_select() */
-       volatile unsigned aborted:1;            /* flag, says aborted */
+       unsigned char last_message;             /* last message OUT */
+       struct scsi_cmnd *connected;            /* currently connected cmnd */
+       struct scsi_cmnd *selecting;            /* cmnd to be connected */
+       struct list_head unissued;              /* waiting to be issued */
+       struct list_head autosense;             /* priority issue queue */
+       struct list_head disconnected;          /* waiting for reconnect */
+       spinlock_t lock;                        /* protects this struct */
        int flags;
-       unsigned long time_expires;             /* in jiffies, set prior to sleeping */
-       int select_time;                        /* timer in select for target response */
-       volatile struct scsi_cmnd *selecting;
-       struct delayed_work coroutine;          /* our co-routine */
        struct scsi_eh_save ses;
+       struct scsi_cmnd *sensing;
        char info[256];
        int read_overruns;                /* number of bytes to cut from a
                                           * transfer to handle chip overruns */
-       int retain_dma_intr;
        struct work_struct main_task;
-       volatile int main_running;
 #ifdef SUPPORT_TAGS
        struct tag_alloc TagAlloc[8][8];        /* 8 targets and 8 LUNs */
 #endif
@@ -292,10 +263,23 @@ struct NCR5380_hostdata {
        unsigned spin_max_r;
        unsigned spin_max_w;
 #endif
+       struct workqueue_struct *work_q;
+       unsigned long accesses_per_ms;  /* chip register accesses per ms */
 };
 
 #ifdef __KERNEL__
 
+struct NCR5380_cmd {
+       struct list_head list;
+};
+
+#define NCR5380_CMD_SIZE               (sizeof(struct NCR5380_cmd))
+
+static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd_ptr)
+{
+       return ((struct scsi_cmnd *)ncmd_ptr) - 1;
+}
+
 #ifndef NDEBUG
 #define NDEBUG (0)
 #endif
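
NCR5380_to_scmd() relies on the mid-layer allocating the cmd_size private area immediately after the struct scsi_cmnd itself, so conversion in each direction is plain pointer arithmetic. A sketch of the round trip, assuming the host template sets .cmd_size = NCR5380_CMD_SIZE as the board drivers below do:

	struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);	/* == (void *)(cmd + 1) */
	struct scsi_cmnd *again = NCR5380_to_scmd(ncmd);	/* back to cmd */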
@@ -304,6 +288,11 @@ struct NCR5380_hostdata {
        do { if ((NDEBUG) & (flg)) \
                printk(KERN_DEBUG fmt, ## __VA_ARGS__); } while (0)
 
+#define dsprintk(flg, host, fmt, ...) \
+       do { if ((NDEBUG) & (flg)) \
+               shost_printk(KERN_DEBUG, host, fmt, ## __VA_ARGS__); \
+       } while (0)
+
 #if NDEBUG
 #define NCR5380_dprint(flg, arg) \
        do { if ((NDEBUG) & (flg)) NCR5380_print(arg); } while (0)
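
The new dsprintk() keeps dprintk()'s compile-time flag filtering but routes output through shost_printk(), which adds the standard "scsi hostN:" device prefix. A usage sketch (the flag choice and message are illustrative):

	/* Only emits when NDEBUG includes NDEBUG_QUEUES; prints e.g.
	 * "scsi host0: command ... moved to disconnected queue".
	 * With the flag absent from NDEBUG, the compiler drops the call. */
	dsprintk(NDEBUG_QUEUES, instance,
	         "command %p moved to disconnected queue\n", cmd);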
@@ -320,6 +309,7 @@ static void NCR5380_print(struct Scsi_Host *instance);
 static int NCR5380_probe_irq(struct Scsi_Host *instance, int possible);
 #endif
 static int NCR5380_init(struct Scsi_Host *instance, int flags);
+static int NCR5380_maybe_reset_bus(struct Scsi_Host *);
 static void NCR5380_exit(struct Scsi_Host *instance);
 static void NCR5380_information_transfer(struct Scsi_Host *instance);
 #ifndef DONT_USE_INTR
@@ -328,7 +318,7 @@ static irqreturn_t NCR5380_intr(int irq, void *dev_id);
 static void NCR5380_main(struct work_struct *work);
 static const char *NCR5380_info(struct Scsi_Host *instance);
 static void NCR5380_reselect(struct Scsi_Host *instance);
-static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd);
+static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *, struct scsi_cmnd *);
 #if defined(PSEUDO_DMA) || defined(REAL_DMA) || defined(REAL_DMA_POLL)
 static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data);
 #endif
@@ -443,5 +433,4 @@ static __inline__ int NCR5380_pc_dma_residual(struct Scsi_Host *instance)
 #endif                         /* defined(i386) || defined(__alpha__) */
 #endif                         /* defined(REAL_DMA)  */
 #endif                         /* __KERNEL__ */
-#endif                         /* ndef ASM */
 #endif                         /* NCR5380_H */
index d28d6c0f18c0b76acd384f53ed6132a54f13fa57..221f18c5df9363c704d8289c2baab4b10dd89db6 100644 (file)
@@ -4,9 +4,7 @@
  * Copyright 1995-2002, Russell King
  */
 #include <linux/module.h>
-#include <linux/signal.h>
 #include <linux/ioport.h>
-#include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/init.h>
 
 
 #include <scsi/scsi_host.h>
 
-#include <scsi/scsicam.h>
-
 #define PSEUDO_DMA
 
 #define priv(host)                     ((struct NCR5380_hostdata *)(host)->hostdata)
-#define NCR5380_local_declare()                struct Scsi_Host *_instance
-#define NCR5380_setup(instance)                _instance = instance
-#define NCR5380_read(reg)              cumanascsi_read(_instance, reg)
-#define NCR5380_write(reg, value)      cumanascsi_write(_instance, reg, value)
+#define NCR5380_read(reg)              cumanascsi_read(instance, reg)
+#define NCR5380_write(reg, value)      cumanascsi_write(instance, reg, value)
+
+#define NCR5380_dma_xfer_len(instance, cmd, phase)     (cmd->transfersize)
+
 #define NCR5380_intr                   cumanascsi_intr
 #define NCR5380_queue_command          cumanascsi_queue_command
 #define NCR5380_info                   cumanascsi_info
@@ -211,6 +208,8 @@ static struct scsi_host_template cumanascsi_template = {
        .cmd_per_lun            = 2,
        .use_clustering         = DISABLE_CLUSTERING,
        .proc_name              = "CumanaSCSI-1",
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
 
 static int cumanascsi1_probe(struct expansion_card *ec,
@@ -240,23 +239,21 @@ static int cumanascsi1_probe(struct expansion_card *ec,
 
        host->irq = ec->irq;
 
-       NCR5380_init(host, 0);
+       ret = NCR5380_init(host, 0);
+       if (ret)
+               goto out_unmap;
+
+       NCR5380_maybe_reset_bus(host);
 
         priv(host)->ctrl = 0;
         writeb(0, priv(host)->base + CTRL);
 
-       host->n_io_port = 255;
-       if (!(request_region(host->io_port, host->n_io_port, "CumanaSCSI-1"))) {
-               ret = -EBUSY;
-               goto out_unmap;
-       }
-
        ret = request_irq(host->irq, cumanascsi_intr, 0,
                          "CumanaSCSI-1", host);
        if (ret) {
                printk("scsi%d: IRQ%d not free: %d\n",
                    host->host_no, host->irq, ret);
-               goto out_unmap;
+               goto out_exit;
        }
 
        ret = scsi_add_host(host, &ec->dev);
@@ -268,6 +265,8 @@ static int cumanascsi1_probe(struct expansion_card *ec,
 
  out_free_irq:
        free_irq(host->irq, host);
+ out_exit:
+       NCR5380_exit(host);
  out_unmap:
        iounmap(priv(host)->base);
        iounmap(priv(host)->dma);
index 7c6fa1479c9c1f4c92cd98c891e800d8b7a12ecb..1fab1d1896b1a7e81a53d2dc664e6f52a712b6d4 100644 (file)
@@ -5,9 +5,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/signal.h>
 #include <linux/ioport.h>
-#include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/init.h>
 
 #define DONT_USE_INTR
 
 #define priv(host)                     ((struct NCR5380_hostdata *)(host)->hostdata)
-#define NCR5380_local_declare()                void __iomem *_base
-#define NCR5380_setup(host)            _base = priv(host)->base
 
-#define NCR5380_read(reg)              readb(_base + ((reg) << 2))
-#define NCR5380_write(reg, value)      writeb(value, _base + ((reg) << 2))
+#define NCR5380_read(reg) \
+       readb(priv(instance)->base + ((reg) << 2))
+#define NCR5380_write(reg, value) \
+       writeb(value, priv(instance)->base + ((reg) << 2))
+
+#define NCR5380_dma_xfer_len(instance, cmd, phase)     (cmd->transfersize)
+
 #define NCR5380_queue_command          oakscsi_queue_command
 #define NCR5380_info                   oakscsi_info
-#define NCR5380_show_info              oakscsi_show_info
 
 #define NCR5380_implementation_fields  \
        void __iomem *base
@@ -103,7 +103,6 @@ printk("reading %p len %d\n", addr, len);
 
 static struct scsi_host_template oakscsi_template = {
        .module                 = THIS_MODULE,
-       .show_info              = oakscsi_show_info,
        .name                   = "Oak 16-bit SCSI",
        .info                   = oakscsi_info,
        .queuecommand           = oakscsi_queue_command,
@@ -115,6 +114,8 @@ static struct scsi_host_template oakscsi_template = {
        .cmd_per_lun            = 2,
        .use_clustering         = DISABLE_CLUSTERING,
        .proc_name              = "oakscsi",
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
 
 static int oakscsi_probe(struct expansion_card *ec, const struct ecard_id *id)
@@ -142,15 +143,21 @@ static int oakscsi_probe(struct expansion_card *ec, const struct ecard_id *id)
        host->irq = NO_IRQ;
        host->n_io_port = 255;
 
-       NCR5380_init(host, 0);
+       ret = NCR5380_init(host, 0);
+       if (ret)
+               goto out_unmap;
+
+       NCR5380_maybe_reset_bus(host);
 
        ret = scsi_add_host(host, &ec->dev);
        if (ret)
-               goto out_unmap;
+               goto out_exit;
 
        scsi_scan_host(host);
        goto out;
 
+ out_exit:
+       NCR5380_exit(host);
  out_unmap:
        iounmap(priv(host)->base);
  unreg:
index db87ece6edb2cd317d463c1afb21f7b9a9942b33..e65478651ca94030ac2b5d2671eecc7e7d4456e3 100644 (file)
@@ -1,15 +1,15 @@
 /*
  * NCR 5380 generic driver routines.  These should make it *trivial*
- *     to implement 5380 SCSI drivers under Linux with a non-trantor
- *     architecture.
+ * to implement 5380 SCSI drivers under Linux with a non-trantor
+ * architecture.
  *
- *     Note that these routines also work with NR53c400 family chips.
+ * Note that these routines also work with NCR53c400 family chips.
  *
  * Copyright 1993, Drew Eckhardt
- *     Visionary Computing
- *     (Unix and Linux consulting and custom programming)
- *     drew@colorado.edu
- *     +1 (303) 666-5836
+ * Visionary Computing
+ * (Unix and Linux consulting and custom programming)
+ * drew@colorado.edu
+ * +1 (303) 666-5836
  *
  * For more information, please consult
  *
  * 1+ (800) 334-5454
  */
 
-/*
- * ++roman: To port the 5380 driver to the Atari, I had to do some changes in
- * this file, too:
- *
- *  - Some of the debug statements were incorrect (undefined variables and the
- *    like). I fixed that.
- *
- *  - In information_transfer(), I think a #ifdef was wrong. Looking at the
- *    possible DMA transfer size should also happen for REAL_DMA. I added this
- *    in the #if statement.
- *
- *  - When using real DMA, information_transfer() should return in a DATAOUT
- *    phase after starting the DMA. It has nothing more to do.
- *
- *  - The interrupt service routine should run main after end of DMA, too (not
- *    only after RESELECTION interrupts). Additionally, it should _not_ test
- *    for more interrupts after running main, since a DMA process may have
- *    been started and interrupts are turned on now. The new int could happen
- *    inside the execution of NCR5380_intr(), leading to recursive
- *    calls.
- *
- *  - I've added a function merge_contiguous_buffers() that tries to
- *    merge scatter-gather buffers that are located at contiguous
- *    physical addresses and can be processed with the same DMA setup.
- *    Since most scatter-gather operations work on a page (4K) of
- *    4 buffers (1K), in more than 90% of all cases three interrupts and
- *    DMA setup actions are saved.
- *
- * - I've deleted all the stuff for AUTOPROBE_IRQ, REAL_DMA_POLL, PSEUDO_DMA
- *    and USLEEP, because these were messing up readability and will never be
- *    needed for Atari SCSI.
- *
- * - I've revised the NCR5380_main() calling scheme (relax the 'main_running'
- *   stuff), and 'main' is executed in a bottom half if awoken by an
- *   interrupt.
- *
- * - The code was quite cluttered up by "#if (NDEBUG & NDEBUG_*) printk..."
- *   constructs. In my eyes, this made the source rather unreadable, so I
- *   finally replaced that by the *_PRINTK() macros.
- *
- */
-
-/*
- * Further development / testing that should be done :
- * 1.  Test linked command handling code after Eric is ready with
- *     the high level code.
- */
+/* Ported to Atari by Roman Hodek and others. */
 
 /* Adapted for the sun3 by Sam Creasey. */
 
-#include <scsi/scsi_dbg.h>
-#include <scsi/scsi_transport_spi.h>
-
-#if (NDEBUG & NDEBUG_LISTS)
-#define LIST(x, y)                                             \
-       do {                                                    \
-               printk("LINE:%d   Adding %p to %p\n",           \
-                      __LINE__, (void*)(x), (void*)(y));       \
-               if ((x) == (y))                                 \
-                       udelay(5);                              \
-       } while (0)
-#define REMOVE(w, x, y, z)                                     \
-       do {                                                    \
-               printk("LINE:%d   Removing: %p->%p  %p->%p \n", \
-                      __LINE__, (void*)(w), (void*)(x),        \
-                      (void*)(y), (void*)(z));                 \
-               if ((x) == (y))                                 \
-                       udelay(5);                              \
-       } while (0)
-#else
-#define LIST(x,y)
-#define REMOVE(w,x,y,z)
-#endif
-
-#ifndef notyet
-#undef LINKED
-#endif
-
 /*
  * Design
  *
  * piece of hardware that requires you to sit in a loop polling for
  * the REQ signal as long as you are connected.  Some devices are
  * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect
- * while doing long seek operations.
- *
- * The workaround for this is to keep track of devices that have
- * disconnected.  If the device hasn't disconnected, for commands that
- * should disconnect, we do something like
- *
- * while (!REQ is asserted) { sleep for N usecs; poll for M usecs }
- *
- * Some tweaking of N and M needs to be done.  An algorithm based
- * on "time to data" would give the best results as long as short time
- * to datas (ie, on the same track) were considered, however these
+ * while doing long seek operations. [...] These
  * broken devices are the exception rather than the rule and I'd rather
  * spend my time optimizing for the normal case.
  *
  *
  * These macros control options :
  * AUTOSENSE - if defined, REQUEST SENSE will be performed automatically
- *     for commands that return with a CHECK CONDITION status.
+ * for commands that return with a CHECK CONDITION status.
  *
  * DIFFERENTIAL - if defined, NCR53c81 chips will use external differential
- *     transceivers.
- *
- * LINKED - if defined, linked commands are supported.
+ * transceivers.
  *
  * REAL_DMA - if defined, REAL DMA is used during the data transfer phases.
  *
  * NCR5380_write(register, value) - write to the specific register
  *
  * NCR5380_implementation_fields  - additional fields needed for this
- *      specific implementation of the NCR5380
+ * specific implementation of the NCR5380
  *
  * Either real DMA *or* pseudo DMA may be implemented
  * REAL functions :
  * NCR5380_REAL_DMA should be defined if real DMA is to be used.
  * Note that the DMA setup functions should return the number of bytes
- *     that they were able to program the controller for.
+ * that they were able to program the controller for.
  *
  * Also note that generic i386/PC versions of these macros are
- *     available as NCR5380_i386_dma_write_setup,
- *     NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual.
+ * available as NCR5380_i386_dma_write_setup,
+ * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual.
  *
  * NCR5380_dma_write_setup(instance, src, count) - initialize
  * NCR5380_dma_read_setup(instance, dst, count) - initialize
  * possible) function may be used.
  */
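
These option and accessor macros are supplied by each board driver before it includes the core, which is how one copy of the core logic serves many boards. A hedged sketch of the usual arrangement, patterned on the cumana_1/oak drivers above (the exact macro set varies per board):

	#define PSEUDO_DMA
	#define NCR5380_implementation_fields	void __iomem *base
	#define NCR5380_read(reg)	readb(priv(instance)->base + ((reg) << 2))
	#define NCR5380_write(reg, value) \
		writeb(value, priv(instance)->base + ((reg) << 2))

	#include "../NCR5380.h"
	#include "../NCR5380.c"		/* core routines, compiled per board */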
 
-/* Macros ease life... :-) */
-#define        SETUP_HOSTDATA(in)                              \
-    struct NCR5380_hostdata *hostdata =                        \
-       (struct NCR5380_hostdata *)(in)->hostdata
-#define        HOSTDATA(in) ((struct NCR5380_hostdata *)(in)->hostdata)
-
-#define        NEXT(cmd)               ((struct scsi_cmnd *)(cmd)->host_scribble)
-#define        SET_NEXT(cmd,next)      ((cmd)->host_scribble = (void *)(next))
-#define        NEXTADDR(cmd)           ((struct scsi_cmnd **)&(cmd)->host_scribble)
-
-#define        HOSTNO          instance->host_no
-#define        H_NO(cmd)       (cmd)->device->host->host_no
+static int do_abort(struct Scsi_Host *);
+static void do_reset(struct Scsi_Host *);
 
 #ifdef SUPPORT_TAGS
 
  * cannot know it in advance :-( We just see a QUEUE_FULL status being
  * returned. So, in this case, the driver internal queue size assumption is
  * reduced to the number of active tags if QUEUE_FULL is returned by the
- * target. The command is returned to the mid-level, but with status changed
- * to BUSY, since --as I've seen-- the mid-level can't handle QUEUE_FULL
- * correctly.
+ * target.
  *
 * We're also not allowed to run tagged commands as long as an untagged
  * command is active. And REQUEST SENSE commands after a contingent allegiance
@@ -304,7 +206,8 @@ static void __init init_tags(struct NCR5380_hostdata *hostdata)
 static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged)
 {
        u8 lun = cmd->device->lun;
-       SETUP_HOSTDATA(cmd->device->host);
+       struct Scsi_Host *instance = cmd->device->host;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
 
        if (hostdata->busy[cmd->device->id] & (1 << lun))
                return 1;
@@ -314,8 +217,8 @@ static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged)
                return 0;
        if (hostdata->TagAlloc[scmd_id(cmd)][lun].nr_allocated >=
            hostdata->TagAlloc[scmd_id(cmd)][lun].queue_size) {
-               dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d: no free tags\n",
-                          H_NO(cmd), cmd->device->id, lun);
+               dsprintk(NDEBUG_TAGS, instance, "target %d lun %d: no free tags\n",
+                        scmd_id(cmd), lun);
                return 1;
        }
        return 0;
@@ -330,7 +233,8 @@ static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged)
 static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged)
 {
        u8 lun = cmd->device->lun;
-       SETUP_HOSTDATA(cmd->device->host);
+       struct Scsi_Host *instance = cmd->device->host;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
 
        /* If we or the target don't support tagged queuing, allocate the LUN for
         * an untagged command.
@@ -340,18 +244,16 @@ static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged)
            !cmd->device->tagged_supported) {
                cmd->tag = TAG_NONE;
                hostdata->busy[cmd->device->id] |= (1 << lun);
-               dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d now allocated by untagged "
-                          "command\n", H_NO(cmd), cmd->device->id, lun);
+               dsprintk(NDEBUG_TAGS, instance, "target %d lun %d now allocated by untagged command\n",
+                        scmd_id(cmd), lun);
        } else {
                struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun];
 
                cmd->tag = find_first_zero_bit(ta->allocated, MAX_TAGS);
                set_bit(cmd->tag, ta->allocated);
                ta->nr_allocated++;
-               dprintk(NDEBUG_TAGS, "scsi%d: using tag %d for target %d lun %d "
-                          "(now %d tags in use)\n",
-                          H_NO(cmd), cmd->tag, cmd->device->id,
-                          lun, ta->nr_allocated);
+               dsprintk(NDEBUG_TAGS, instance, "using tag %d for target %d lun %d (%d tags allocated)\n",
+                        cmd->tag, scmd_id(cmd), lun, ta->nr_allocated);
        }
 }
 
@@ -363,21 +265,22 @@ static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged)
 static void cmd_free_tag(struct scsi_cmnd *cmd)
 {
        u8 lun = cmd->device->lun;
-       SETUP_HOSTDATA(cmd->device->host);
+       struct Scsi_Host *instance = cmd->device->host;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
 
        if (cmd->tag == TAG_NONE) {
                hostdata->busy[cmd->device->id] &= ~(1 << lun);
-               dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d untagged cmd finished\n",
-                          H_NO(cmd), cmd->device->id, lun);
+               dsprintk(NDEBUG_TAGS, instance, "target %d lun %d untagged cmd freed\n",
+                        scmd_id(cmd), lun);
        } else if (cmd->tag >= MAX_TAGS) {
-               printk(KERN_NOTICE "scsi%d: trying to free bad tag %d!\n",
-                      H_NO(cmd), cmd->tag);
+               shost_printk(KERN_NOTICE, instance,
+                            "trying to free bad tag %d!\n", cmd->tag);
        } else {
                struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun];
                clear_bit(cmd->tag, ta->allocated);
                ta->nr_allocated--;
-               dprintk(NDEBUG_TAGS, "scsi%d: freed tag %d for target %d lun %d\n",
-                          H_NO(cmd), cmd->tag, cmd->device->id, lun);
+               dsprintk(NDEBUG_TAGS, instance, "freed tag %d for target %d lun %d\n",
+                        cmd->tag, scmd_id(cmd), lun);
        }
 }
 
@@ -401,17 +304,15 @@ static void free_all_tags(struct NCR5380_hostdata *hostdata)
 
 #endif /* SUPPORT_TAGS */
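/* Editor's sketch: the tag bookkeeping above leans on a small per-target,
 * per-LUN structure kept in the hostdata. The shape below is inferred from
 * the fields used by is_lun_busy()/cmd_get_tag()/cmd_free_tag(); the exact
 * header contents may differ.
 */
struct tag_alloc {
	DECLARE_BITMAP(allocated, MAX_TAGS);	/* one bit per tag in flight */
	int nr_allocated;			/* tags currently in use */
	int queue_size;				/* shrunk if QUEUE_FULL is seen */
};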
 
-
-/*
- * Function: void merge_contiguous_buffers( struct scsi_cmnd *cmd )
- *
- * Purpose: Try to merge several scatter-gather requests into one DMA
- *    transfer. This is possible if the scatter buffers lie on
- *    physical contiguous addresses.
- *
- * Parameters: struct scsi_cmnd *cmd
- *    The command to work on. The first scatter buffer's data are
- *    assumed to be already transferred into ptr/this_residual.
+/**
+ * merge_contiguous_buffers - coalesce scatter-gather list entries
+ * @cmd: command requesting IO
+ *
+ * Try to merge several scatter-gather buffers into one DMA transfer.
+ * This is possible if the scatter buffers lie on physically
+ * contiguous addresses. The first scatter-gather buffer's data are
+ * assumed to be already transferred into cmd->SCp.this_residual.
+ * Every buffer merged avoids an interrupt and a DMA setup operation.
  */
 
 static void merge_contiguous_buffers(struct scsi_cmnd *cmd)
@@ -463,9 +364,7 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd)
                cmd->SCp.buffers_residual = scsi_sg_count(cmd) - 1;
                cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
                cmd->SCp.this_residual = cmd->SCp.buffer->length;
-               /* ++roman: Try to merge some scatter-buffers if they are at
-                * contiguous physical addresses.
-                */
+
                merge_contiguous_buffers(cmd);
        } else {
                cmd->SCp.buffer = NULL;
@@ -473,31 +372,110 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd)
                cmd->SCp.ptr = NULL;
                cmd->SCp.this_residual = 0;
        }
+
+       cmd->SCp.Status = 0;
+       cmd->SCp.Message = 0;
+}
+
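/* Editor's sketch of the merge test (the body of merge_contiguous_buffers()
 * is elided by this hunk): two scatter-gather entries can be coalesced when
 * the physical address one past the end of the current buffer equals the
 * physical start of the next entry. Assumes a flat scatterlist, as the
 * historical code did; illustrative, not the verbatim body.
 */
static void merge_contiguous_buffers_sketch(struct scsi_cmnd *cmd)
{
	unsigned long endaddr = virt_to_phys(cmd->SCp.ptr +
					     cmd->SCp.this_residual - 1) + 1;

	while (cmd->SCp.buffers_residual &&
	       virt_to_phys(sg_virt(&cmd->SCp.buffer[1])) == endaddr) {
		++cmd->SCp.buffer;			/* absorb next entry */
		--cmd->SCp.buffers_residual;
		cmd->SCp.this_residual += cmd->SCp.buffer->length;
		endaddr += cmd->SCp.buffer->length;
	}
}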
+/**
+ * NCR5380_poll_politely2 - wait for two chip register values
+ * @instance: controller to poll
+ * @reg1: 5380 register to poll
+ * @bit1: Bitmask to check
+ * @val1: Expected value
+ * @reg2: Second 5380 register to poll
+ * @bit2: Second bitmask to check
+ * @val2: Second expected value
+ * @wait: Time-out in jiffies
+ *
+ * Polls the chip in a reasonably efficient manner waiting for an
+ * event to occur. After a short quick poll we begin to yield the CPU
+ * (if possible). In irq contexts the time-out is arbitrarily limited.
+ * Callers may hold locks as long as they are held in irq mode.
+ *
+ * Returns 0 if either or both events occurred, otherwise -ETIMEDOUT.
+ */
+
+static int NCR5380_poll_politely2(struct Scsi_Host *instance,
+                                  int reg1, int bit1, int val1,
+                                  int reg2, int bit2, int val2, int wait)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       unsigned long deadline = jiffies + wait;
+       unsigned long n;
+
+       /* Busy-wait for up to 10 ms */
+       n = min(10000U, jiffies_to_usecs(wait));
+       n *= hostdata->accesses_per_ms;
+       n /= 2000;
+       do {
+               if ((NCR5380_read(reg1) & bit1) == val1)
+                       return 0;
+               if ((NCR5380_read(reg2) & bit2) == val2)
+                       return 0;
+               cpu_relax();
+       } while (n--);
+
+       if (irqs_disabled() || in_interrupt())
+               return -ETIMEDOUT;
+
+       /* Repeatedly sleep for 1 ms until deadline */
+       while (time_is_after_jiffies(deadline)) {
+               schedule_timeout_uninterruptible(1);
+               if ((NCR5380_read(reg1) & bit1) == val1)
+                       return 0;
+               if ((NCR5380_read(reg2) & bit2) == val2)
+                       return 0;
+       }
+
+       return -ETIMEDOUT;
 }
 
-#include <linux/delay.h>
+static inline int NCR5380_poll_politely(struct Scsi_Host *instance,
+                                        int reg, int bit, int val, int wait)
+{
+       return NCR5380_poll_politely2(instance, reg, bit, val,
+                                               reg, bit, val, wait);
+}
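/* Editor's sketch: a typical caller, following the usual 5380 PIO pattern --
 * wait up to one second for the target to (re)assert REQ before touching the
 * data register. Illustrative only.
 */
static int wait_for_req_example(struct Scsi_Host *instance)
{
	return NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ);
}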
 
 #if NDEBUG
 static struct {
        unsigned char mask;
        const char *name;
 } signals[] = {
-       { SR_DBP, "PARITY"}, { SR_RST, "RST" }, { SR_BSY, "BSY" },
-       { SR_REQ, "REQ" }, { SR_MSG, "MSG" }, { SR_CD,  "CD" }, { SR_IO, "IO" },
-       { SR_SEL, "SEL" }, {0, NULL}
-}, basrs[] = {
-       {BASR_ATN, "ATN"}, {BASR_ACK, "ACK"}, {0, NULL}
-}, icrs[] = {
-       {ICR_ASSERT_RST, "ASSERT RST"},{ICR_ASSERT_ACK, "ASSERT ACK"},
-       {ICR_ASSERT_BSY, "ASSERT BSY"}, {ICR_ASSERT_SEL, "ASSERT SEL"},
-       {ICR_ASSERT_ATN, "ASSERT ATN"}, {ICR_ASSERT_DATA, "ASSERT DATA"},
+       {SR_DBP, "PARITY"},
+       {SR_RST, "RST"},
+       {SR_BSY, "BSY"},
+       {SR_REQ, "REQ"},
+       {SR_MSG, "MSG"},
+       {SR_CD, "CD"},
+       {SR_IO, "IO"},
+       {SR_SEL, "SEL"},
        {0, NULL}
-}, mrs[] = {
-       {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, {MR_TARGET, "MODE TARGET"},
-       {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, {MR_ENABLE_PAR_INTR,
-       "MODE PARITY INTR"}, {MR_ENABLE_EOP_INTR,"MODE EOP INTR"},
+},
+basrs[] = {
+       {BASR_ATN, "ATN"},
+       {BASR_ACK, "ACK"},
+       {0, NULL}
+},
+icrs[] = {
+       {ICR_ASSERT_RST, "ASSERT RST"},
+       {ICR_ASSERT_ACK, "ASSERT ACK"},
+       {ICR_ASSERT_BSY, "ASSERT BSY"},
+       {ICR_ASSERT_SEL, "ASSERT SEL"},
+       {ICR_ASSERT_ATN, "ASSERT ATN"},
+       {ICR_ASSERT_DATA, "ASSERT DATA"},
+       {0, NULL}
+},
+mrs[] = {
+       {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"},
+       {MR_TARGET, "MODE TARGET"},
+       {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"},
+       {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"},
+       {MR_ENABLE_EOP_INTR, "MODE EOP INTR"},
        {MR_MONITOR_BSY, "MODE MONITOR BSY"},
-       {MR_DMA_MODE, "MODE DMA"}, {MR_ARBITRATE, "MODE ARBITRATION"},
+       {MR_DMA_MODE, "MODE DMA"},
+       {MR_ARBITRATE, "MODE ARBITRATION"},
        {0, NULL}
 };
 
@@ -511,15 +489,13 @@ static struct {
 static void NCR5380_print(struct Scsi_Host *instance)
 {
        unsigned char status, data, basr, mr, icr, i;
-       unsigned long flags;
 
-       local_irq_save(flags);
        data = NCR5380_read(CURRENT_SCSI_DATA_REG);
        status = NCR5380_read(STATUS_REG);
        mr = NCR5380_read(MODE_REG);
        icr = NCR5380_read(INITIATOR_COMMAND_REG);
        basr = NCR5380_read(BUS_AND_STATUS_REG);
-       local_irq_restore(flags);
+
        printk("STATUS_REG: %02x ", status);
        for (i = 0; signals[i].mask; ++i)
                if (status & signals[i].mask)
@@ -543,8 +519,12 @@ static struct {
        unsigned char value;
        const char *name;
 } phases[] = {
-       {PHASE_DATAOUT, "DATAOUT"}, {PHASE_DATAIN, "DATAIN"}, {PHASE_CMDOUT, "CMDOUT"},
-       {PHASE_STATIN, "STATIN"}, {PHASE_MSGOUT, "MSGOUT"}, {PHASE_MSGIN, "MSGIN"},
+       {PHASE_DATAOUT, "DATAOUT"},
+       {PHASE_DATAIN, "DATAIN"},
+       {PHASE_CMDOUT, "CMDOUT"},
+       {PHASE_STATIN, "STATIN"},
+       {PHASE_MSGOUT, "MSGOUT"},
+       {PHASE_MSGIN, "MSGIN"},
        {PHASE_UNKNOWN, "UNKNOWN"}
 };
 
@@ -553,8 +533,6 @@ static struct {
  * @instance: adapter to dump
  *
  * Print the current SCSI phase for debugging purposes
- *
- * Locks: none
  */
 
 static void NCR5380_print_phase(struct Scsi_Host *instance)
@@ -564,54 +542,21 @@ static void NCR5380_print_phase(struct Scsi_Host *instance)
 
        status = NCR5380_read(STATUS_REG);
        if (!(status & SR_REQ))
-               printk(KERN_DEBUG "scsi%d: REQ not asserted, phase unknown.\n", HOSTNO);
+               shost_printk(KERN_DEBUG, instance, "REQ not asserted, phase unknown.\n");
        else {
                for (i = 0; (phases[i].value != PHASE_UNKNOWN) &&
                     (phases[i].value != (status & PHASE_MASK)); ++i)
                        ;
-               printk(KERN_DEBUG "scsi%d: phase %s\n", HOSTNO, phases[i].name);
+               shost_printk(KERN_DEBUG, instance, "phase %s\n", phases[i].name);
        }
 }
-
 #endif
 
-/*
- * ++roman: New scheme of calling NCR5380_main()
- *
- * If we're not in an interrupt, we can call our main directly, it cannot be
- * already running. Else, we queue it on a task queue, if not 'main_running'
- * tells us that a lower level is already executing it. This way,
- * 'main_running' needs not be protected in a special way.
- *
- * queue_main() is a utility function for putting our main onto the task
- * queue, if main_running is false. It should be called only from a
- * interrupt or bottom half.
- */
-
-#include <linux/gfp.h>
-#include <linux/workqueue.h>
-#include <linux/interrupt.h>
-
-static inline void queue_main(struct NCR5380_hostdata *hostdata)
-{
-       if (!hostdata->main_running) {
-               /* If in interrupt and NCR5380_main() not already running,
-                  queue it on the 'immediate' task queue, to be processed
-                  immediately after the current interrupt processing has
-                  finished. */
-               schedule_work(&hostdata->main_task);
-       }
-       /* else: nothing to do: the running NCR5380_main() will pick up
-          any newly queued command. */
-}
-
 /**
  * NCR5380_info - report driver and host information
  * @instance: relevant scsi host instance
  *
  * For use as the host template info() handler.
- *
- * Locks: none
  */
 
 static const char *NCR5380_info(struct Scsi_Host *instance)
@@ -630,13 +575,14 @@ static void prepare_info(struct Scsi_Host *instance)
                 "base 0x%lx, irq %d, "
                 "can_queue %d, cmd_per_lun %d, "
                 "sg_tablesize %d, this_id %d, "
-                "flags { %s}, "
+                "flags { %s%s}, "
                 "options { %s} ",
                 instance->hostt->name, instance->io_port, instance->n_io_port,
                 instance->base, instance->irq,
                 instance->can_queue, instance->cmd_per_lun,
                 instance->sg_tablesize, instance->this_id,
                 hostdata->flags & FLAG_TAGGED_QUEUING ? "TAGGED_QUEUING " : "",
+                hostdata->flags & FLAG_TOSHIBA_DELAY  ? "TOSHIBA_DELAY "  : "",
 #ifdef DIFFERENTIAL
                 "DIFFERENTIAL "
 #endif
@@ -652,102 +598,6 @@ static void prepare_info(struct Scsi_Host *instance)
                 "");
 }
 
-/**
- * NCR5380_print_status - dump controller info
- * @instance: controller to dump
- *
- * Print commands in the various queues, called from NCR5380_abort
- * to aid debugging.
- */
-
-static void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd)
-{
-       int i, s;
-       unsigned char *command;
-       printk("scsi%d: destination target %d, lun %llu\n",
-               H_NO(cmd), cmd->device->id, cmd->device->lun);
-       printk(KERN_CONT "        command = ");
-       command = cmd->cmnd;
-       printk(KERN_CONT "%2d (0x%02x)", command[0], command[0]);
-       for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i)
-               printk(KERN_CONT " %02x", command[i]);
-       printk("\n");
-}
-
-static void NCR5380_print_status(struct Scsi_Host *instance)
-{
-       struct NCR5380_hostdata *hostdata;
-       struct scsi_cmnd *ptr;
-       unsigned long flags;
-
-       NCR5380_dprint(NDEBUG_ANY, instance);
-       NCR5380_dprint_phase(NDEBUG_ANY, instance);
-
-       hostdata = (struct NCR5380_hostdata *)instance->hostdata;
-
-       local_irq_save(flags);
-       printk("NCR5380: coroutine is%s running.\n",
-               hostdata->main_running ? "" : "n't");
-       if (!hostdata->connected)
-               printk("scsi%d: no currently connected command\n", HOSTNO);
-       else
-               lprint_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected);
-       printk("scsi%d: issue_queue\n", HOSTNO);
-       for (ptr = (struct scsi_cmnd *)hostdata->issue_queue; ptr; ptr = NEXT(ptr))
-               lprint_Scsi_Cmnd(ptr);
-
-       printk("scsi%d: disconnected_queue\n", HOSTNO);
-       for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr;
-            ptr = NEXT(ptr))
-               lprint_Scsi_Cmnd(ptr);
-
-       local_irq_restore(flags);
-       printk("\n");
-}
-
-static void show_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m)
-{
-       int i, s;
-       unsigned char *command;
-       seq_printf(m, "scsi%d: destination target %d, lun %llu\n",
-               H_NO(cmd), cmd->device->id, cmd->device->lun);
-       seq_puts(m, "        command = ");
-       command = cmd->cmnd;
-       seq_printf(m, "%2d (0x%02x)", command[0], command[0]);
-       for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i)
-               seq_printf(m, " %02x", command[i]);
-       seq_putc(m, '\n');
-}
-
-static int __maybe_unused NCR5380_show_info(struct seq_file *m,
-                                            struct Scsi_Host *instance)
-{
-       struct NCR5380_hostdata *hostdata;
-       struct scsi_cmnd *ptr;
-       unsigned long flags;
-
-       hostdata = (struct NCR5380_hostdata *)instance->hostdata;
-
-       local_irq_save(flags);
-       seq_printf(m, "NCR5380: coroutine is%s running.\n",
-               hostdata->main_running ? "" : "n't");
-       if (!hostdata->connected)
-               seq_printf(m, "scsi%d: no currently connected command\n", HOSTNO);
-       else
-               show_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected, m);
-       seq_printf(m, "scsi%d: issue_queue\n", HOSTNO);
-       for (ptr = (struct scsi_cmnd *)hostdata->issue_queue; ptr; ptr = NEXT(ptr))
-               show_Scsi_Cmnd(ptr, m);
-
-       seq_printf(m, "scsi%d: disconnected_queue\n", HOSTNO);
-       for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr;
-            ptr = NEXT(ptr))
-               show_Scsi_Cmnd(ptr, m);
-
-       local_irq_restore(flags);
-       return 0;
-}
-
 /**
  * NCR5380_init - initialise an NCR5380
  * @instance: adapter to configure
@@ -764,11 +614,11 @@ static int __maybe_unused NCR5380_show_info(struct seq_file *m,
 
 static int __init NCR5380_init(struct Scsi_Host *instance, int flags)
 {
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int i;
-       SETUP_HOSTDATA(instance);
+       unsigned long deadline;
 
        hostdata->host = instance;
-       hostdata->aborted = 0;
        hostdata->id_mask = 1 << instance->this_id;
        hostdata->id_higher_mask = 0;
        for (i = hostdata->id_mask; i <= 0x80; i <<= 1)
@@ -782,13 +632,21 @@ static int __init NCR5380_init(struct Scsi_Host *instance, int flags)
 #if defined (REAL_DMA)
        hostdata->dma_len = 0;
 #endif
-       hostdata->targets_present = 0;
+       spin_lock_init(&hostdata->lock);
        hostdata->connected = NULL;
-       hostdata->issue_queue = NULL;
-       hostdata->disconnected_queue = NULL;
+       hostdata->sensing = NULL;
+       INIT_LIST_HEAD(&hostdata->autosense);
+       INIT_LIST_HEAD(&hostdata->unissued);
+       INIT_LIST_HEAD(&hostdata->disconnected);
+
        hostdata->flags = flags;
 
        INIT_WORK(&hostdata->main_task, NCR5380_main);
+       hostdata->work_q = alloc_workqueue("ncr5380_%d",
+                               WQ_UNBOUND | WQ_MEM_RECLAIM,
+                               1, instance->host_no);
+       if (!hostdata->work_q)
+               return -ENOMEM;
 
        prepare_info(instance);
 
@@ -797,6 +655,72 @@ static int __init NCR5380_init(struct Scsi_Host *instance, int flags)
        NCR5380_write(TARGET_COMMAND_REG, 0);
        NCR5380_write(SELECT_ENABLE_REG, 0);
 
+       /* Calibrate register polling loop */
+       i = 0;
+       deadline = jiffies + 1;
+       do {
+               cpu_relax();
+       } while (time_is_after_jiffies(deadline));
+       deadline += msecs_to_jiffies(256);
+       do {
+               NCR5380_read(STATUS_REG);
+               ++i;
+               cpu_relax();
+       } while (time_is_after_jiffies(deadline));
+       hostdata->accesses_per_ms = i / 256;
+
+       return 0;
+}
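/* Editor's note on the calibration above: accesses_per_ms records how many
 * chip register reads fit into one millisecond on this machine. In
 * NCR5380_poll_politely2() the busy-wait budget then becomes
 *
 *	n = min(10000, usecs(wait)) * accesses_per_ms / 2000
 *
 * i.e. about half the register reads that fit in the (at most 10 ms)
 * busy-wait window, matching the two reads per polling iteration.
 */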
+
+/**
+ * NCR5380_maybe_reset_bus - Detect and correct bus wedge problems.
+ * @instance: adapter to check
+ *
+ * If the system crashed, it may have crashed with a connected target and
+ * the SCSI bus busy. Check for BUS FREE phase. If not, try to abort the
+ * currently established nexus, which we know nothing about. Failing that,
+ * do a bus reset.
+ *
+ * Note that a bus reset will cause the chip to assert IRQ.
+ *
+ * Returns 0 if successful, otherwise -ENXIO.
+ */
+
+static int NCR5380_maybe_reset_bus(struct Scsi_Host *instance)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       int pass;
+
+       for (pass = 1; (NCR5380_read(STATUS_REG) & SR_BSY) && pass <= 6; ++pass) {
+               switch (pass) {
+               case 1:
+               case 3:
+               case 5:
+                       shost_printk(KERN_ERR, instance, "SCSI bus busy, waiting up to five seconds\n");
+                       NCR5380_poll_politely(instance,
+                                             STATUS_REG, SR_BSY, 0, 5 * HZ);
+                       break;
+               case 2:
+                       shost_printk(KERN_ERR, instance, "bus busy, attempting abort\n");
+                       do_abort(instance);
+                       break;
+               case 4:
+                       shost_printk(KERN_ERR, instance, "bus busy, attempting reset\n");
+                       do_reset(instance);
+                       /* Wait after a reset; the SCSI standard calls for
+                        * 250ms; we wait 500ms to be on the safe side.
+                        * But some Toshiba CD-ROMs need ten times that.
+                        */
+                       if (hostdata->flags & FLAG_TOSHIBA_DELAY)
+                               msleep(2500);
+                       else
+                               msleep(500);
+                       break;
+               case 6:
+                       shost_printk(KERN_ERR, instance, "bus locked solid\n");
+                       return -ENXIO;
+               }
+       }
        return 0;
 }
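/* Editor's sketch: the expected probe-time sequence in a board-specific
 * driver, assuming the usual calling convention for this core driver
 * (function name illustrative):
 */
static int board_probe_example(struct Scsi_Host *instance, int host_flags)
{
	int error = NCR5380_init(instance, host_flags);

	if (error)
		return error;
	/* The bus may be wedged after a crash; try to recover it now. */
	return NCR5380_maybe_reset_bus(instance);
}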
 
@@ -812,6 +736,38 @@ static void NCR5380_exit(struct Scsi_Host *instance)
        struct NCR5380_hostdata *hostdata = shost_priv(instance);
 
        cancel_work_sync(&hostdata->main_task);
+       destroy_workqueue(hostdata->work_q);
+}
+
+/**
+ * complete_cmd - finish processing a command and return it to the SCSI ML
+ * @instance: the host instance
+ * @cmd: command to complete
+ */
+
+static void complete_cmd(struct Scsi_Host *instance,
+                         struct scsi_cmnd *cmd)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+
+       dsprintk(NDEBUG_QUEUES, instance, "complete_cmd: cmd %p\n", cmd);
+
+       if (hostdata->sensing == cmd) {
+               /* Autosense processing ends here */
+               if ((cmd->result & 0xff) != SAM_STAT_GOOD) {
+                       scsi_eh_restore_cmnd(cmd, &hostdata->ses);
+                       set_host_byte(cmd, DID_ERROR);
+               } else
+                       scsi_eh_restore_cmnd(cmd, &hostdata->ses);
+               hostdata->sensing = NULL;
+       }
+
+#ifdef SUPPORT_TAGS
+       cmd_free_tag(cmd);
+#else
+       hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun);
+#endif
+       cmd->scsi_done(cmd);
 }
 
 /**
@@ -819,7 +775,7 @@ static void NCR5380_exit(struct Scsi_Host *instance)
  * @instance: the relevant SCSI adapter
  * @cmd: SCSI command
  *
- * cmd is added to the per instance issue_queue, with minor
+ * cmd is added to the per-instance issue queue, with minor
  * twiddling done to the host specific fields of cmd.  If the
  * main coroutine is not running, it is restarted.
  */
@@ -828,44 +784,23 @@ static int NCR5380_queue_command(struct Scsi_Host *instance,
                                  struct scsi_cmnd *cmd)
 {
        struct NCR5380_hostdata *hostdata = shost_priv(instance);
-       struct scsi_cmnd *tmp;
+       struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
        unsigned long flags;
 
 #if (NDEBUG & NDEBUG_NO_WRITE)
        switch (cmd->cmnd[0]) {
        case WRITE_6:
        case WRITE_10:
-               printk(KERN_NOTICE "scsi%d: WRITE attempted with NO_WRITE debugging flag set\n",
-                      H_NO(cmd));
+               shost_printk(KERN_DEBUG, instance, "WRITE attempted with NDEBUG_NO_WRITE set\n");
                cmd->result = (DID_ERROR << 16);
                cmd->scsi_done(cmd);
                return 0;
        }
 #endif /* (NDEBUG & NDEBUG_NO_WRITE) */
 
-       /*
-        * We use the host_scribble field as a pointer to the next command
-        * in a queue
-        */
-
-       SET_NEXT(cmd, NULL);
        cmd->result = 0;
 
        /*
-        * Insert the cmd into the issue queue. Note that REQUEST SENSE
-        * commands are added to the head of the queue since any command will
-        * clear the contingent allegiance condition that exists and the
-        * sense data is only guaranteed to be valid while the condition exists.
-        */
-
-       /* ++guenther: now that the issue queue is being set up, we can lock ST-DMA.
-        * Otherwise a running NCR5380_main may steal the lock.
-        * Lock before actually inserting due to fairness reasons explained in
-        * atari_scsi.c. If we insert first, then it's impossible for this driver
-        * to release the lock.
-        * Stop timer for this command while waiting for the lock, or timeouts
-        * may happen (and they really do), and it's no good if the command doesn't
-        * appear in any of the queues.
         * ++roman: Just disabling the NCR interrupt isn't sufficient here,
         * because also a timer int can trigger an abort or reset, which would
         * alter queues and touch the lock.
@@ -873,7 +808,7 @@ static int NCR5380_queue_command(struct Scsi_Host *instance,
        if (!NCR5380_acquire_dma_irq(instance))
                return SCSI_MLQUEUE_HOST_BUSY;
 
-       local_irq_save(flags);
+       spin_lock_irqsave(&hostdata->lock, flags);
 
        /*
         * Insert the cmd into the issue queue. Note that REQUEST SENSE
@@ -882,33 +817,18 @@ static int NCR5380_queue_command(struct Scsi_Host *instance,
         * sense data is only guaranteed to be valid while the condition exists.
         */
 
-       if (!(hostdata->issue_queue) || (cmd->cmnd[0] == REQUEST_SENSE)) {
-               LIST(cmd, hostdata->issue_queue);
-               SET_NEXT(cmd, hostdata->issue_queue);
-               hostdata->issue_queue = cmd;
-       } else {
-               for (tmp = (struct scsi_cmnd *)hostdata->issue_queue;
-                    NEXT(tmp); tmp = NEXT(tmp))
-                       ;
-               LIST(cmd, tmp);
-               SET_NEXT(tmp, cmd);
-       }
-       local_irq_restore(flags);
+       if (cmd->cmnd[0] == REQUEST_SENSE)
+               list_add(&ncmd->list, &hostdata->unissued);
+       else
+               list_add_tail(&ncmd->list, &hostdata->unissued);
 
-       dprintk(NDEBUG_QUEUES, "scsi%d: command added to %s of queue\n", H_NO(cmd),
-                 (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail");
+       spin_unlock_irqrestore(&hostdata->lock, flags);
 
-       /* If queue_command() is called from an interrupt (real one or bottom
-        * half), we let queue_main() do the job of taking care about main. If it
-        * is already running, this is a no-op, else main will be queued.
-        *
-        * If we're not in an interrupt, we can call NCR5380_main()
-        * unconditionally, because it cannot be already running.
-        */
-       if (in_interrupt() || irqs_disabled())
-               queue_main(hostdata);
-       else
-               NCR5380_main(&hostdata->main_task);
+       dsprintk(NDEBUG_QUEUES, instance, "command %p added to %s of queue\n",
+                cmd, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail");
+
+       /* Kick off command processing */
+       queue_work(hostdata->work_q, &hostdata->main_task);
        return 0;
 }
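/* Editor's sketch: the list linkage used above lives in the per-command
 * private area that the midlayer allocates behind each scsi_cmnd (sized by
 * the host template's cmd_size). Assumed shape of the header definitions:
 */
struct NCR5380_cmd {
	struct list_head list;	/* links unissued/autosense/disconnected */
};

/* scsi_cmd_priv() returns the area directly following the scsi_cmnd, so
 * the inverse mapping is plain pointer arithmetic.
 */
static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd)
{
	return ((struct scsi_cmnd *)ncmd) - 1;
}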
 
@@ -917,13 +837,78 @@ static inline void maybe_release_dma_irq(struct Scsi_Host *instance)
        struct NCR5380_hostdata *hostdata = shost_priv(instance);
 
        /* Caller does the locking needed to set & test these data atomically */
-       if (!hostdata->disconnected_queue &&
-           !hostdata->issue_queue &&
+       if (list_empty(&hostdata->disconnected) &&
+           list_empty(&hostdata->unissued) &&
+           list_empty(&hostdata->autosense) &&
            !hostdata->connected &&
-           !hostdata->retain_dma_intr)
+           !hostdata->selecting)
                NCR5380_release_dma_irq(instance);
 }
 
+/**
+ * dequeue_next_cmd - dequeue a command for processing
+ * @instance: the scsi host instance
+ *
+ * Priority is given to commands on the autosense queue. These commands
+ * need autosense because of a CHECK CONDITION result.
+ *
+ * Returns a command pointer if a command is found for a target that is
+ * not already busy. Otherwise returns NULL.
+ */
+
+static struct scsi_cmnd *dequeue_next_cmd(struct Scsi_Host *instance)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       struct NCR5380_cmd *ncmd;
+       struct scsi_cmnd *cmd;
+
+       if (list_empty(&hostdata->autosense)) {
+               list_for_each_entry(ncmd, &hostdata->unissued, list) {
+                       cmd = NCR5380_to_scmd(ncmd);
+                       dsprintk(NDEBUG_QUEUES, instance, "dequeue: cmd=%p target=%d busy=0x%02x lun=%llu\n",
+                                cmd, scmd_id(cmd), hostdata->busy[scmd_id(cmd)], cmd->device->lun);
+
+                       if (
+#ifdef SUPPORT_TAGS
+                           !is_lun_busy(cmd, 1)
+#else
+                           !(hostdata->busy[scmd_id(cmd)] & (1 << cmd->device->lun))
+#endif
+                       ) {
+                               list_del(&ncmd->list);
+                               dsprintk(NDEBUG_QUEUES, instance,
+                                        "dequeue: removed %p from issue queue\n", cmd);
+                               return cmd;
+                       }
+               }
+       } else {
+               /* Autosense processing begins here */
+               ncmd = list_first_entry(&hostdata->autosense,
+                                       struct NCR5380_cmd, list);
+               list_del(&ncmd->list);
+               cmd = NCR5380_to_scmd(ncmd);
+               dsprintk(NDEBUG_QUEUES, instance,
+                        "dequeue: removed %p from autosense queue\n", cmd);
+               scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0);
+               hostdata->sensing = cmd;
+               return cmd;
+       }
+       return NULL;
+}
+
+static void requeue_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
+
+       if (hostdata->sensing) {
+               scsi_eh_restore_cmnd(cmd, &hostdata->ses);
+               list_add(&ncmd->list, &hostdata->autosense);
+               hostdata->sensing = NULL;
+       } else
+               list_add(&ncmd->list, &hostdata->unissued);
+}
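/* Editor's note: the autosense round trip implemented by the two helpers
 * above together with complete_cmd() is:
 *   1. a command finishing with CHECK CONDITION is parked on
 *      hostdata->autosense (in code outside these hunks);
 *   2. dequeue_next_cmd() prefers that queue, wraps the command with
 *      scsi_eh_prep_cmnd() and records it in hostdata->sensing;
 *   3. when the REQUEST SENSE completes, complete_cmd() restores the
 *      original command via scsi_eh_restore_cmnd() before calling
 *      cmd->scsi_done();
 *   4. if selection fails in between, requeue_cmd() undoes step 2 so the
 *      sense attempt can be retried.
 */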
+
 /**
  * NCR5380_main - NCR state machines
  *
@@ -931,8 +916,6 @@ static inline void maybe_release_dma_irq(struct Scsi_Host *instance)
  * be done on the NCR5380 host adapters in a system.  Both
  * NCR5380_queue_command() and NCR5380_intr() will try to start it
  * in case it is not running.
- *
- * Locks: called as its own thread with no locks held.
  */
 
 static void NCR5380_main(struct work_struct *work)
@@ -940,154 +923,69 @@ static void NCR5380_main(struct work_struct *work)
        struct NCR5380_hostdata *hostdata =
                container_of(work, struct NCR5380_hostdata, main_task);
        struct Scsi_Host *instance = hostdata->host;
-       struct scsi_cmnd *tmp, *prev;
+       struct scsi_cmnd *cmd;
        int done;
-       unsigned long flags;
 
        /*
-        * We run (with interrupts disabled) until we're sure that none of
-        * the host adapters have anything that can be done, at which point
-        * we set main_running to 0 and exit.
-        *
-        * Interrupts are enabled before doing various other internal
-        * instructions, after we've decided that we need to run through
-        * the loop again.
-        *
-        * this should prevent any race conditions.
-        *
         * ++roman: Just disabling the NCR interrupt isn't sufficient here,
         * because also a timer int can trigger an abort or reset, which can
         * alter queues and touch the Falcon lock.
         */
 
-       /* Tell int handlers main() is now already executing.  Note that
-          no races are possible here. If an int comes in before
-          'main_running' is set here, and queues/executes main via the
-          task queue, it doesn't do any harm, just this instance of main
-          won't find any work left to do. */
-       if (hostdata->main_running)
-               return;
-       hostdata->main_running = 1;
-
-       local_save_flags(flags);
        do {
-               local_irq_disable();    /* Freeze request queues */
                done = 1;
 
-               if (!hostdata->connected) {
-                       dprintk(NDEBUG_MAIN, "scsi%d: not connected\n", HOSTNO);
-                       /*
-                        * Search through the issue_queue for a command destined
-                        * for a target that's not busy.
-                        */
-#if (NDEBUG & NDEBUG_LISTS)
-                       for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, prev = NULL;
-                            tmp && (tmp != prev); prev = tmp, tmp = NEXT(tmp))
-                               ;
-                       /*printk("%p  ", tmp);*/
-                       if ((tmp == prev) && tmp)
-                               printk(" LOOP\n");
-                       /* else printk("\n"); */
-#endif
-                       for (tmp = (struct scsi_cmnd *) hostdata->issue_queue,
-                            prev = NULL; tmp; prev = tmp, tmp = NEXT(tmp)) {
-                               u8 lun = tmp->device->lun;
-
-                               dprintk(NDEBUG_LISTS,
-                                       "MAIN tmp=%p target=%d busy=%d lun=%d\n",
-                                       tmp, scmd_id(tmp), hostdata->busy[scmd_id(tmp)],
-                                       lun);
-                               /*  When we find one, remove it from the issue queue. */
-                               /* ++guenther: possible race with Falcon locking */
-                               if (
-#ifdef SUPPORT_TAGS
-                                   !is_lun_busy( tmp, tmp->cmnd[0] != REQUEST_SENSE)
-#else
-                                   !(hostdata->busy[tmp->device->id] & (1 << lun))
-#endif
-                                   ) {
-                                       /* ++guenther: just to be sure, this must be atomic */
-                                       local_irq_disable();
-                                       if (prev) {
-                                               REMOVE(prev, NEXT(prev), tmp, NEXT(tmp));
-                                               SET_NEXT(prev, NEXT(tmp));
-                                       } else {
-                                               REMOVE(-1, hostdata->issue_queue, tmp, NEXT(tmp));
-                                               hostdata->issue_queue = NEXT(tmp);
-                                       }
-                                       SET_NEXT(tmp, NULL);
-                                       hostdata->retain_dma_intr++;
+               spin_lock_irq(&hostdata->lock);
+               while (!hostdata->connected &&
+                      (cmd = dequeue_next_cmd(instance))) {
 
-                                       /* reenable interrupts after finding one */
-                                       local_irq_restore(flags);
+                       dsprintk(NDEBUG_MAIN, instance, "main: dequeued %p\n", cmd);
 
-                                       /*
-                                        * Attempt to establish an I_T_L nexus here.
-                                        * On success, instance->hostdata->connected is set.
-                                        * On failure, we must add the command back to the
-                                        *   issue queue so we can keep trying.
-                                        */
-                                       dprintk(NDEBUG_MAIN, "scsi%d: main(): command for target %d "
-                                                   "lun %d removed from issue_queue\n",
-                                                   HOSTNO, tmp->device->id, lun);
-                                       /*
-                                        * REQUEST SENSE commands are issued without tagged
-                                        * queueing, even on SCSI-II devices because the
-                                        * contingent allegiance condition exists for the
-                                        * entire unit.
-                                        */
-                                       /* ++roman: ...and the standard also requires that
-                                        * REQUEST SENSE command are untagged.
-                                        */
+                       /*
+                        * Attempt to establish an I_T_L nexus here.
+                        * On success, instance->hostdata->connected is set.
+                        * On failure, we must add the command back to the
+                        * issue queue so we can keep trying.
+                        */
+                       /*
+                        * REQUEST SENSE commands are issued without tagged
+                        * queueing, even on SCSI-II devices because the
+                        * contingent allegiance condition exists for the
+                        * entire unit.
+                        */
+                       /* ++roman: ...and the standard also requires that
+                        * REQUEST SENSE command are untagged.
+                        */
 
 #ifdef SUPPORT_TAGS
-                                       cmd_get_tag(tmp, tmp->cmnd[0] != REQUEST_SENSE);
+                       cmd_get_tag(cmd, cmd->cmnd[0] != REQUEST_SENSE);
 #endif
-                                       if (!NCR5380_select(instance, tmp)) {
-                                               local_irq_disable();
-                                               hostdata->retain_dma_intr--;
-                                               /* release if target did not response! */
-                                               maybe_release_dma_irq(instance);
-                                               local_irq_restore(flags);
-                                               break;
-                                       } else {
-                                               local_irq_disable();
-                                               LIST(tmp, hostdata->issue_queue);
-                                               SET_NEXT(tmp, hostdata->issue_queue);
-                                               hostdata->issue_queue = tmp;
+                       cmd = NCR5380_select(instance, cmd);
+                       if (!cmd) {
+                               dsprintk(NDEBUG_MAIN, instance, "main: select complete\n");
+                               maybe_release_dma_irq(instance);
+                       } else {
+                               dsprintk(NDEBUG_MAIN | NDEBUG_QUEUES, instance,
+                                        "main: select failed, returning %p to queue\n", cmd);
+                               requeue_cmd(instance, cmd);
 #ifdef SUPPORT_TAGS
-                                               cmd_free_tag(tmp);
+                               cmd_free_tag(cmd);
 #endif
-                                               hostdata->retain_dma_intr--;
-                                               local_irq_restore(flags);
-                                               dprintk(NDEBUG_MAIN, "scsi%d: main(): select() failed, "
-                                                           "returned to issue_queue\n", HOSTNO);
-                                               if (hostdata->connected)
-                                                       break;
-                                       }
-                               } /* if target/lun/target queue is not busy */
-                       } /* for issue_queue */
-               } /* if (!hostdata->connected) */
-
+                       }
+               }
                if (hostdata->connected
 #ifdef REAL_DMA
                    && !hostdata->dma_len
 #endif
                    ) {
-                       local_irq_restore(flags);
-                       dprintk(NDEBUG_MAIN, "scsi%d: main: performing information transfer\n",
-                                   HOSTNO);
+                       dsprintk(NDEBUG_MAIN, instance, "main: performing information transfer\n");
                        NCR5380_information_transfer(instance);
-                       dprintk(NDEBUG_MAIN, "scsi%d: main: done set false\n", HOSTNO);
                        done = 0;
                }
+               spin_unlock_irq(&hostdata->lock);
+               if (!done)
+                       cond_resched();
        } while (!done);
-
-       /* Better allow ints _after_ 'main_running' has been cleared, else
-          an interrupt could believe we'll pick up the work it left for
-          us, but we won't see it anymore here... */
-       hostdata->main_running = 0;
-       local_irq_restore(flags);
 }
 
 
@@ -1096,27 +994,20 @@ static void NCR5380_main(struct work_struct *work)
  * Function : void NCR5380_dma_complete (struct Scsi_Host *instance)
  *
  * Purpose : Called by interrupt handler when DMA finishes or a phase
- *     mismatch occurs (which would finish the DMA transfer).
+ * mismatch occurs (which would finish the DMA transfer).
  *
  * Inputs : instance - this instance of the NCR5380.
- *
  */
 
 static void NCR5380_dma_complete(struct Scsi_Host *instance)
 {
-       SETUP_HOSTDATA(instance);
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int transferred;
        unsigned char **data;
-       volatile int *count;
+       int *count;
        int saved_data = 0, overrun = 0;
        unsigned char p;
 
-       if (!hostdata->connected) {
-               printk(KERN_WARNING "scsi%d: received end of DMA interrupt with "
-                      "no connected cmd\n", HOSTNO);
-               return;
-       }
-
        if (hostdata->read_overruns) {
                p = hostdata->connected->SCp.phase;
                if (p & SR_IO) {
@@ -1126,15 +1017,11 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance)
                            (BASR_PHASE_MATCH|BASR_ACK)) {
                                saved_data = NCR5380_read(INPUT_DATA_REG);
                                overrun = 1;
-                               dprintk(NDEBUG_DMA, "scsi%d: read overrun handled\n", HOSTNO);
+                               dsprintk(NDEBUG_DMA, instance, "read overrun handled\n");
                        }
                }
        }
 
-       dprintk(NDEBUG_DMA, "scsi%d: real DMA transfer complete, basr 0x%X, sr 0x%X\n",
-                  HOSTNO, NCR5380_read(BUS_AND_STATUS_REG),
-                  NCR5380_read(STATUS_REG));
-
 #if defined(CONFIG_SUN3)
        if ((sun3scsi_dma_finish(rq_data_dir(hostdata->connected->request)))) {
                pr_err("scsi%d: overrun in UDC counter -- not prepared to deal with this!\n",
@@ -1153,9 +1040,9 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance)
        }
 #endif
 
-       (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
        NCR5380_write(MODE_REG, MR_BASE);
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
 
        transferred = hostdata->dma_len - NCR5380_dma_residual(instance);
        hostdata->dma_len = 0;
@@ -1194,140 +1081,160 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance)
  * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses
  * from the disconnected queue, and restarting NCR5380_main()
  * as required.
+ *
+ * The chip can assert IRQ in any of six different conditions. The IRQ flag
+ * is then cleared by reading the Reset Parity/Interrupt Register (RPIR).
+ * Three of these six conditions are latched in the Bus and Status Register:
+ * - End of DMA (cleared by ending DMA Mode)
+ * - Parity error (cleared by reading RPIR)
+ * - Loss of BSY (cleared by reading RPIR)
+ * Two conditions have flag bits that are not latched:
+ * - Bus phase mismatch (non-maskable in DMA Mode, cleared by ending DMA Mode)
+ * - Bus reset (non-maskable)
+ * The remaining condition has no flag bit at all:
+ * - Selection/reselection
+ *
+ * Hence, establishing the cause(s) of any interrupt is partly guesswork.
+ * In "The DP8490 and DP5380 Comparison Guide", National Semiconductor
+ * claimed that "the design of the [DP8490] interrupt logic ensures
+ * interrupts will not be lost (they can be on the DP5380)."
+ * The L5380/53C80 datasheet from LOGIC Devices has more details.
+ *
+ * Checking for bus reset by reading RST is futile because of interrupt
+ * latency, but a bus reset will reset chip logic. Checking for parity error
+ * is unnecessary because that interrupt is never enabled. A Loss of BSY
+ * condition will clear DMA Mode. We can tell when this occurs because the
+ * Busy Monitor interrupt is enabled together with DMA Mode.
  */
 
 static irqreturn_t NCR5380_intr(int irq, void *dev_id)
 {
        struct Scsi_Host *instance = dev_id;
-       int done = 1, handled = 0;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       int handled = 0;
        unsigned char basr;
+       unsigned long flags;
 
-       dprintk(NDEBUG_INTR, "scsi%d: NCR5380 irq triggered\n", HOSTNO);
+       spin_lock_irqsave(&hostdata->lock, flags);
 
-       /* Look for pending interrupts */
        basr = NCR5380_read(BUS_AND_STATUS_REG);
-       dprintk(NDEBUG_INTR, "scsi%d: BASR=%02x\n", HOSTNO, basr);
-       /* dispatch to appropriate routine if found and done=0 */
        if (basr & BASR_IRQ) {
-               NCR5380_dprint(NDEBUG_INTR, instance);
-               if ((NCR5380_read(STATUS_REG) & (SR_SEL|SR_IO)) == (SR_SEL|SR_IO)) {
-                       done = 0;
-                       dprintk(NDEBUG_INTR, "scsi%d: SEL interrupt\n", HOSTNO);
-                       NCR5380_reselect(instance);
-                       (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-               } else if (basr & BASR_PARITY_ERROR) {
-                       dprintk(NDEBUG_INTR, "scsi%d: PARITY interrupt\n", HOSTNO);
-                       (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-               } else if ((NCR5380_read(STATUS_REG) & SR_RST) == SR_RST) {
-                       dprintk(NDEBUG_INTR, "scsi%d: RESET interrupt\n", HOSTNO);
-                       (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-               } else {
-                       /*
-                        * The rest of the interrupt conditions can occur only during a
-                        * DMA transfer
-                        */
+               unsigned char mr = NCR5380_read(MODE_REG);
+               unsigned char sr = NCR5380_read(STATUS_REG);
+
+               dsprintk(NDEBUG_INTR, instance, "IRQ %d, BASR 0x%02x, SR 0x%02x, MR 0x%02x\n",
+                        irq, basr, sr, mr);
 
 #if defined(REAL_DMA)
-                       /*
-                        * We should only get PHASE MISMATCH and EOP interrupts if we have
-                        * DMA enabled, so do a sanity check based on the current setting
-                        * of the MODE register.
+               if ((mr & MR_DMA_MODE) || (mr & MR_MONITOR_BSY)) {
+                       /* Probably End of DMA, Phase Mismatch or Loss of BSY.
+                        * We ack IRQ after clearing Mode Register. Workarounds
+                        * for End of DMA errata need to happen in DMA Mode.
                         */
 
-                       if ((NCR5380_read(MODE_REG) & MR_DMA_MODE) &&
-                           ((basr & BASR_END_DMA_TRANSFER) ||
-                            !(basr & BASR_PHASE_MATCH))) {
+                       dsprintk(NDEBUG_INTR, instance, "interrupt in DMA mode\n");
 
-                               dprintk(NDEBUG_INTR, "scsi%d: PHASE MISM or EOP interrupt\n", HOSTNO);
-                               NCR5380_dma_complete( instance );
-                               done = 0;
-                       } else
+                       if (hostdata->connected) {
+                               NCR5380_dma_complete(instance);
+                               queue_work(hostdata->work_q, &hostdata->main_task);
+                       } else {
+                               NCR5380_write(MODE_REG, MR_BASE);
+                               NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+                       }
+               } else
 #endif /* REAL_DMA */
-                       {
-/* MS: Ignore unknown phase mismatch interrupts (caused by EOP interrupt) */
-                               if (basr & BASR_PHASE_MATCH)
-                                       dprintk(NDEBUG_INTR, "scsi%d: unknown interrupt, "
-                                              "BASR 0x%x, MR 0x%x, SR 0x%x\n",
-                                              HOSTNO, basr, NCR5380_read(MODE_REG),
-                                              NCR5380_read(STATUS_REG));
-                               (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+               if ((NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_mask) &&
+                   (sr & (SR_SEL | SR_IO | SR_BSY | SR_RST)) == (SR_SEL | SR_IO)) {
+                       /* Probably reselected */
+                       NCR5380_write(SELECT_ENABLE_REG, 0);
+                       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+
+                       dsprintk(NDEBUG_INTR, instance, "interrupt with SEL and IO\n");
+
+                       if (!hostdata->connected) {
+                               NCR5380_reselect(instance);
+                               queue_work(hostdata->work_q, &hostdata->main_task);
+                       }
+                       if (!hostdata->connected)
+                               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
+               } else {
+                       /* Probably Bus Reset */
+                       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+
+                       dsprintk(NDEBUG_INTR, instance, "unknown interrupt\n");
 #ifdef SUN3_SCSI_VME
-                               dregs->csr |= CSR_DMA_ENABLE;
+                       dregs->csr |= CSR_DMA_ENABLE;
 #endif
-                       }
-               } /* if !(SELECTION || PARITY) */
+               }
                handled = 1;
-       } /* BASR & IRQ */ else {
-               printk(KERN_NOTICE "scsi%d: interrupt without IRQ bit set in BASR, "
-                      "BASR 0x%X, MR 0x%X, SR 0x%x\n", HOSTNO, basr,
-                      NCR5380_read(MODE_REG), NCR5380_read(STATUS_REG));
-               (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+       } else {
+               shost_printk(KERN_NOTICE, instance, "interrupt without IRQ bit\n");
 #ifdef SUN3_SCSI_VME
                dregs->csr |= CSR_DMA_ENABLE;
 #endif
        }
 
-       if (!done) {
-               dprintk(NDEBUG_INTR, "scsi%d: in int routine, calling main\n", HOSTNO);
-               /* Put a call to NCR5380_main() on the queue... */
-               queue_main(shost_priv(instance));
-       }
+       spin_unlock_irqrestore(&hostdata->lock, flags);
+
        return IRQ_RETVAL(handled);
 }
 
 /*
  * Function : int NCR5380_select(struct Scsi_Host *instance,
- *                               struct scsi_cmnd *cmd)
+ * struct scsi_cmnd *cmd)
  *
  * Purpose : establishes I_T_L or I_T_L_Q nexus for new or existing command,
- *     including ARBITRATION, SELECTION, and initial message out for
- *     IDENTIFY and queue messages.
+ * including ARBITRATION, SELECTION, and initial message out for
+ * IDENTIFY and queue messages.
  *
  * Inputs : instance - instantiation of the 5380 driver on which this
- *     target lives, cmd - SCSI command to execute.
+ * target lives, cmd - SCSI command to execute.
  *
- * Returns : -1 if selection could not execute for some reason,
- *     0 if selection succeeded or failed because the target
- *     did not respond.
+ * Returns cmd if selection failed but should be retried,
+ * NULL if selection failed and should not be retried, or
+ * NULL if selection succeeded (hostdata->connected == cmd).
  *
  * Side effects :
- *     If bus busy, arbitration failed, etc, NCR5380_select() will exit
- *             with registers as they should have been on entry - ie
- *             SELECT_ENABLE will be set appropriately, the NCR5380
- *             will cease to drive any SCSI bus signals.
+ * If bus busy, arbitration failed, etc, NCR5380_select() will exit
+ * with registers as they should have been on entry - ie
+ * SELECT_ENABLE will be set appropriately, the NCR5380
+ * will cease to drive any SCSI bus signals.
  *
- *     If successful : I_T_L or I_T_L_Q nexus will be established,
- *             instance->connected will be set to cmd.
- *             SELECT interrupt will be disabled.
+ * If successful : I_T_L or I_T_L_Q nexus will be established,
+ * instance->connected will be set to cmd.
+ * SELECT interrupt will be disabled.
  *
- *     If failed (no target) : cmd->scsi_done() will be called, and the
- *             cmd->result host byte set to DID_BAD_TARGET.
+ * If failed (no target) : cmd->scsi_done() will be called, and the
+ * cmd->result host byte set to DID_BAD_TARGET.
  */
 
-static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
+static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *instance,
+                                        struct scsi_cmnd *cmd)
 {
-       SETUP_HOSTDATA(instance);
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        unsigned char tmp[3], phase;
        unsigned char *data;
        int len;
-       unsigned long timeout;
-       unsigned long flags;
+       int err;
 
-       hostdata->restart_select = 0;
        NCR5380_dprint(NDEBUG_ARBITRATION, instance);
-       dprintk(NDEBUG_ARBITRATION, "scsi%d: starting arbitration, id = %d\n", HOSTNO,
-                  instance->this_id);
+       dsprintk(NDEBUG_ARBITRATION, instance, "starting arbitration, id = %d\n",
+                instance->this_id);
+
+       /*
+        * Arbitration and selection phases are slow and involve dropping the
+        * lock, so we have to watch out for EH. An exception handler may
+        * change 'selecting' to NULL. This function will then return NULL
+        * so that the caller will forget about 'cmd'. (During information
+        * transfer phases, EH may change 'connected' to NULL.)
+        */
+       hostdata->selecting = cmd;
 
        /*
         * Set the phase bits to 0, otherwise the NCR5380 won't drive the
         * data bus during SELECTION.
         */
 
-       local_irq_save(flags);
-       if (hostdata->connected) {
-               local_irq_restore(flags);
-               return -1;
-       }
        NCR5380_write(TARGET_COMMAND_REG, 0);
 
        /*
@@ -1337,96 +1244,77 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
        NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask);
        NCR5380_write(MODE_REG, MR_ARBITRATE);
 
-       local_irq_restore(flags);
-
-       /* Wait for arbitration logic to complete */
-#if defined(NCR_TIMEOUT)
-       {
-               unsigned long timeout = jiffies + 2*NCR_TIMEOUT;
+       /* The chip now waits for BUS FREE phase. Then after the 800 ns
+        * Bus Free Delay, arbitration will begin.
+        */
 
-               while (!(NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_PROGRESS) &&
-                      time_before(jiffies, timeout) && !hostdata->connected)
-                       ;
-               if (time_after_eq(jiffies, timeout)) {
-                       printk("scsi : arbitration timeout at %d\n", __LINE__);
-                       NCR5380_write(MODE_REG, MR_BASE);
-                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-                       return -1;
-               }
+       spin_unlock_irq(&hostdata->lock);
+       err = NCR5380_poll_politely2(instance, MODE_REG, MR_ARBITRATE, 0,
+                       INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS,
+                                              ICR_ARBITRATION_PROGRESS, HZ);
+       spin_lock_irq(&hostdata->lock);
+       if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) {
+               /* Reselection interrupt */
+               goto out;
        }
-#else /* NCR_TIMEOUT */
-       while (!(NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_PROGRESS) &&
-              !hostdata->connected)
-               ;
-#endif
-
-       dprintk(NDEBUG_ARBITRATION, "scsi%d: arbitration complete\n", HOSTNO);
-
-       if (hostdata->connected) {
+       if (err < 0) {
                NCR5380_write(MODE_REG, MR_BASE);
-               return -1;
+               shost_printk(KERN_ERR, instance,
+                            "select: arbitration timeout\n");
+               goto out;
        }
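
   The two-condition poll above replaces the old open-coded busy-wait. A
   minimal sketch of such a helper, with the signature inferred from this
   call site and the sleep/backoff details of the real helper omitted:

	/* Wait until (reg1 & bit1) == val1 or (reg2 & bit2) == val2,
	 * giving up after 'wait' jiffies; the real helper also yields
	 * the CPU once an initial busy-wait budget is exhausted. */
	static int poll_politely2_sketch(struct Scsi_Host *instance,
	                                 int reg1, int bit1, int val1,
	                                 int reg2, int bit2, int val2,
	                                 int wait)
	{
		unsigned long deadline = jiffies + wait;

		do {
			if ((NCR5380_read(reg1) & bit1) == val1)
				return 0;
			if ((NCR5380_read(reg2) & bit2) == val2)
				return 0;
			cpu_relax();
		} while (time_before(jiffies, deadline));
		return -ETIMEDOUT;
	}

   Note that the call is made with hostdata->lock dropped, since the helper
   may sleep.
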
-       /*
-        * The arbitration delay is 2.2us, but this is a minimum and there is
-        * no maximum so we can safely sleep for ceil(2.2) usecs to accommodate
-        * the integral nature of udelay().
-        *
-        */
+       spin_unlock_irq(&hostdata->lock);
 
+       /* The SCSI-2 arbitration delay is 2.4 us */
        udelay(3);
 
        /* Check for lost arbitration */
        if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) ||
            (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) ||
-           (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) ||
-           hostdata->connected) {
+           (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) {
                NCR5380_write(MODE_REG, MR_BASE);
-               dprintk(NDEBUG_ARBITRATION, "scsi%d: lost arbitration, deasserting MR_ARBITRATE\n",
-                          HOSTNO);
-               return -1;
+               dsprintk(NDEBUG_ARBITRATION, instance, "lost arbitration, deasserting MR_ARBITRATE\n");
+               spin_lock_irq(&hostdata->lock);
+               goto out;
        }
 
-       /* after/during arbitration, BSY should be asserted.
-          IBM DPES-31080 Version S31Q works now */
-       /* Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman) */
+       /* After/during arbitration, BSY should be asserted.
+        * IBM DPES-31080 Version S31Q works now
+        * Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman)
+        */
        NCR5380_write(INITIATOR_COMMAND_REG,
                      ICR_BASE | ICR_ASSERT_SEL | ICR_ASSERT_BSY);
 
-       if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) ||
-           hostdata->connected) {
-               NCR5380_write(MODE_REG, MR_BASE);
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-               dprintk(NDEBUG_ARBITRATION, "scsi%d: lost arbitration, deasserting ICR_ASSERT_SEL\n",
-                          HOSTNO);
-               return -1;
-       }
-
        /*
         * Again, bus clear + bus settle time is 1.2us, however, this is
         * a minimum so we'll udelay ceil(1.2)
         */
 
-#ifdef CONFIG_ATARI_SCSI_TOSHIBA_DELAY
-       /* ++roman: But some targets (see above :-) seem to need a bit more... */
-       udelay(15);
-#else
-       udelay(2);
-#endif
+       if (hostdata->flags & FLAG_TOSHIBA_DELAY)
+               udelay(15);
+       else
+               udelay(2);
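
   The compile-time CONFIG_ATARI_SCSI_TOSHIBA_DELAY option becomes the
   runtime flag FLAG_TOSHIBA_DELAY here. Presumably the board driver sets
   it at init time; a sketch, assuming a hypothetical toshiba_delay module
   parameter:

	/* hypothetical init-time translation of the old Kconfig option */
	if (toshiba_delay)
		hostdata->flags |= FLAG_TOSHIBA_DELAY;
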
 
-       if (hostdata->connected) {
+       spin_lock_irq(&hostdata->lock);
+
+       /* NCR5380_reselect() clears MODE_REG after a reselection interrupt */
+       if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE))
+               goto out;
+
+       if (!hostdata->selecting) {
                NCR5380_write(MODE_REG, MR_BASE);
                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-               return -1;
+               goto out;
        }
 
-       dprintk(NDEBUG_ARBITRATION, "scsi%d: won arbitration\n", HOSTNO);
+       dsprintk(NDEBUG_ARBITRATION, instance, "won arbitration\n");
 
        /*
         * Now that we have won arbitration, start Selection process, asserting
         * the host and target ID's on the SCSI bus.
         */
 
-       NCR5380_write(OUTPUT_DATA_REG, (hostdata->id_mask | (1 << cmd->device->id)));
+       NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask | (1 << scmd_id(cmd)));
 
        /*
         * Raise ATN while SEL is true before BSY goes false from arbitration,
@@ -1434,22 +1322,18 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
         * phase immediately after selection.
         */
 
-       NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_BSY |
-                     ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL ));
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY |
+                     ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL);
        NCR5380_write(MODE_REG, MR_BASE);
 
        /*
         * Reselect interrupts must be turned off prior to the dropping of BSY,
         * otherwise we will trigger an interrupt.
         */
-
-       if (hostdata->connected) {
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-               return -1;
-       }
-
        NCR5380_write(SELECT_ENABLE_REG, 0);
 
+       spin_unlock_irq(&hostdata->lock);
+
        /*
         * The initiator shall then wait at least two deskew delays and release
         * the BSY signal.
@@ -1457,8 +1341,8 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
        udelay(1);        /* wingel -- wait two bus deskew delay >2*45ns */
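
   As an arithmetic check on the comment above:

	/* two deskew delays = 2 * 45 ns = 90 ns, so udelay(1)
	 * (= 1000 ns) clears the minimum by an order of magnitude */
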
 
        /* Reset BSY */
-       NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_DATA |
-                     ICR_ASSERT_ATN | ICR_ASSERT_SEL));
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA |
+                     ICR_ASSERT_ATN | ICR_ASSERT_SEL);
 
        /*
         * Something weird happens when we cease to drive BSY - looks
@@ -1479,45 +1363,39 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
 
        udelay(1);
 
-       dprintk(NDEBUG_SELECTION, "scsi%d: selecting target %d\n", HOSTNO, cmd->device->id);
+       dsprintk(NDEBUG_SELECTION, instance, "selecting target %d\n", scmd_id(cmd));
 
        /*
         * The SCSI specification calls for a 250 ms timeout for the actual
         * selection.
         */
 
-       timeout = jiffies + msecs_to_jiffies(250);
-
-       /*
-        * XXX very interesting - we're seeing a bounce where the BSY we
-        * asserted is being reflected / still asserted (propagation delay?)
-        * and it's detecting as true.  Sigh.
-        */
-
-#if 0
-       /* ++roman: If a target conformed to the SCSI standard, it wouldn't assert
-        * IO while SEL is true. But again, there are some disks out the in the
-        * world that do that nevertheless. (Somebody claimed that this announces
-        * reselection capability of the target.) So we better skip that test and
-        * only wait for BSY... (Famous german words: Der Klügere gibt nach :-)
-        */
-
-       while (time_before(jiffies, timeout) &&
-              !(NCR5380_read(STATUS_REG) & (SR_BSY | SR_IO)))
-               ;
+       err = NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, SR_BSY,
+                                   msecs_to_jiffies(250));
 
        if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) {
+               spin_lock_irq(&hostdata->lock);
                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                NCR5380_reselect(instance);
-               printk(KERN_ERR "scsi%d: reselection after won arbitration?\n",
-                      HOSTNO);
+               if (!hostdata->connected)
+                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
+               shost_printk(KERN_ERR, instance, "reselection after won arbitration?\n");
+               goto out;
+       }
+
+       if (err < 0) {
+               spin_lock_irq(&hostdata->lock);
+               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               return -1;
+               /* Can't touch cmd if it has been reclaimed by the scsi ML */
+               if (hostdata->selecting) {
+                       cmd->result = DID_BAD_TARGET << 16;
+                       complete_cmd(instance, cmd);
+                       dsprintk(NDEBUG_SELECTION, instance, "target did not respond within 250ms\n");
+                       cmd = NULL;
+               }
+               goto out;
        }
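
   The hostdata->selecting test guards against the EH race described at the
   top of this function. A hedged sketch of the abort-handler side that
   makes this check necessary (not the verbatim handler):

	/* in the eh_abort_handler, under hostdata->lock */
	if (hostdata->selecting == cmd) {
		/* disown the command; NCR5380_select() will see NULL,
		 * skip completion and return NULL to its caller */
		hostdata->selecting = NULL;
	}
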
-#else
-       while (time_before(jiffies, timeout) && !(NCR5380_read(STATUS_REG) & SR_BSY))
-               ;
-#endif
 
        /*
         * No less than two deskew delays after the initiator detects the
@@ -1525,32 +1403,9 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
         * change the DATA BUS.                                     -wingel
         */
 
-       udelay(1);
-
-       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
-
-       if (!(NCR5380_read(STATUS_REG) & SR_BSY)) {
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-               if (hostdata->targets_present & (1 << cmd->device->id)) {
-                       printk(KERN_ERR "scsi%d: weirdness\n", HOSTNO);
-                       if (hostdata->restart_select)
-                               printk(KERN_NOTICE "\trestart select\n");
-                       NCR5380_dprint(NDEBUG_ANY, instance);
-                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-                       return -1;
-               }
-               cmd->result = DID_BAD_TARGET << 16;
-#ifdef SUPPORT_TAGS
-               cmd_free_tag(cmd);
-#endif
-               cmd->scsi_done(cmd);
-               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               dprintk(NDEBUG_SELECTION, "scsi%d: target did not respond within 250ms\n", HOSTNO);
-               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-               return 0;
-       }
-
-       hostdata->targets_present |= (1 << cmd->device->id);
+       udelay(1);
+
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
 
        /*
         * Since we followed the SCSI spec, and raised ATN while SEL
@@ -1563,16 +1418,27 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
         * until it wraps back to 0.
         *
         * XXX - it turns out that there are some broken SCSI-II devices,
-        *           which claim to support tagged queuing but fail when more than
-        *           some number of commands are issued at once.
+        * which claim to support tagged queuing but fail when more than
+        * some number of commands are issued at once.
         */
 
        /* Wait for start of REQ/ACK handshake */
-       while (!(NCR5380_read(STATUS_REG) & SR_REQ))
-               ;
 
-       dprintk(NDEBUG_SELECTION, "scsi%d: target %d selected, going into MESSAGE OUT phase.\n",
-                  HOSTNO, cmd->device->id);
+       err = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ);
+       spin_lock_irq(&hostdata->lock);
+       if (err < 0) {
+               shost_printk(KERN_ERR, instance, "select: REQ timeout\n");
+               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+               NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
+               goto out;
+       }
+       if (!hostdata->selecting) {
+               do_abort(instance);
+               goto out;
+       }
+
+       dsprintk(NDEBUG_SELECTION, instance, "target %d selected, going into MESSAGE OUT phase.\n",
+                scmd_id(cmd));
        tmp[0] = IDENTIFY(1, cmd->device->lun);
 
 #ifdef SUPPORT_TAGS
@@ -1591,11 +1457,12 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
        data = tmp;
        phase = PHASE_MSGOUT;
        NCR5380_transfer_pio(instance, &phase, &len, &data);
-       dprintk(NDEBUG_SELECTION, "scsi%d: nexus established.\n", HOSTNO);
+       dsprintk(NDEBUG_SELECTION, instance, "nexus established.\n");
        /* XXX need to handle errors here */
+
        hostdata->connected = cmd;
 #ifndef SUPPORT_TAGS
-       hostdata->busy[cmd->device->id] |= (1 << cmd->device->lun);
+       hostdata->busy[cmd->device->id] |= 1 << cmd->device->lun;
 #endif
 #ifdef SUN3_SCSI_VME
        dregs->csr |= CSR_INTR;
@@ -1603,24 +1470,30 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
 
        initialize_SCp(cmd);
 
-       return 0;
+       cmd = NULL;
+
+out:
+       if (!hostdata->selecting)
+               return NULL;
+       hostdata->selecting = NULL;
+       return cmd;
 }
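
   To make the new return convention concrete, a caller-side sketch (the
   real logic lives in the driver's main loop; requeue_cmd() here stands in
   for whatever mechanism puts a command back on the issue queue):

	cmd = NCR5380_select(instance, cmd);
	if (cmd) {
		/* arbitration lost or selection interrupted: retry later */
		requeue_cmd(instance, cmd);
	}
	/* cmd == NULL: selection succeeded (hostdata->connected owns it)
	 * or the command was completed/reclaimed and must be forgotten */
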
 
 /*
  * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance,
- *      unsigned char *phase, int *count, unsigned char **data)
+ * unsigned char *phase, int *count, unsigned char **data)
  *
  * Purpose : transfers data in given phase using polled I/O
  *
  * Inputs : instance - instance of driver, *phase - pointer to
- *     what phase is expected, *count - pointer to number of
- *     bytes to transfer, **data - pointer to data pointer.
+ * what phase is expected, *count - pointer to number of
+ * bytes to transfer, **data - pointer to data pointer.
  *
  * Returns : -1 when different phase is entered without transferring
- *     maximum number of bytes, 0 if all bytes are transferred or exit
- *     is in same phase.
+ * maximum number of bytes, 0 if all bytes are transferred or exit
+ * is in same phase.
  *
- *     Also, *phase, *count, *data are modified in place.
+ * Also, *phase, *count, *data are modified in place.
  *
  * XXX Note : handling for bus free may be useful.
  */
@@ -1635,9 +1508,9 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance,
                                unsigned char *phase, int *count,
                                unsigned char **data)
 {
-       register unsigned char p = *phase, tmp;
-       register int c = *count;
-       register unsigned char *d = *data;
+       unsigned char p = *phase, tmp;
+       int c = *count;
+       unsigned char *d = *data;
 
        /*
         * The NCR5380 chip will only drive the SCSI bus when the
@@ -1652,14 +1525,15 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance,
                 * Wait for assertion of REQ, after which the phase bits will be
                 * valid
                 */
-               while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ))
-                       ;
 
-               dprintk(NDEBUG_HANDSHAKE, "scsi%d: REQ detected\n", HOSTNO);
+               if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0)
+                       break;
+
+               dsprintk(NDEBUG_HANDSHAKE, instance, "REQ asserted\n");
 
                /* Check for phase mismatch */
-               if ((tmp & PHASE_MASK) != p) {
-                       dprintk(NDEBUG_PIO, "scsi%d: phase mismatch\n", HOSTNO);
+               if ((NCR5380_read(STATUS_REG) & PHASE_MASK) != p) {
+                       dsprintk(NDEBUG_PIO, instance, "phase mismatch\n");
                        NCR5380_dprint_phase(NDEBUG_PIO, instance);
                        break;
                }
@@ -1684,35 +1558,36 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance,
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA);
                                NCR5380_dprint(NDEBUG_PIO, instance);
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
-                                             ICR_ASSERT_DATA | ICR_ASSERT_ACK);
+                                             ICR_ASSERT_DATA | ICR_ASSERT_ACK);
                        } else {
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
-                                             ICR_ASSERT_DATA | ICR_ASSERT_ATN);
+                                             ICR_ASSERT_DATA | ICR_ASSERT_ATN);
                                NCR5380_dprint(NDEBUG_PIO, instance);
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
-                                             ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
+                                             ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
                        }
                } else {
                        NCR5380_dprint(NDEBUG_PIO, instance);
                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ACK);
                }
 
-               while (NCR5380_read(STATUS_REG) & SR_REQ)
-                       ;
+               if (NCR5380_poll_politely(instance,
+                                         STATUS_REG, SR_REQ, 0, 5 * HZ) < 0)
+                       break;
 
-               dprintk(NDEBUG_HANDSHAKE, "scsi%d: req false, handshake complete\n", HOSTNO);
+               dsprintk(NDEBUG_HANDSHAKE, instance, "REQ negated, handshake complete\n");
 
                /*
                 * We have several special cases to consider during REQ/ACK handshaking :
                 * 1.  We were in MSGOUT phase, and we are on the last byte of the
                 *      message.  ATN must be dropped as ACK is dropped.
                 *
                 * 2.  We are in a MSGIN phase, and we are on the last byte of the
                 *      message.  We must exit with ACK asserted, so that the calling
                 *      code may raise ATN before dropping ACK to reject the message.
                 *
                 * 3.  ACK and ATN are clear and the target may proceed as normal.
                 */
                if (!(p == PHASE_MSGIN && c == 1)) {
                        if (p == PHASE_MSGOUT && c > 1)
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
@@ -1721,16 +1596,16 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance,
                }
        } while (--c);
 
-       dprintk(NDEBUG_PIO, "scsi%d: residual %d\n", HOSTNO, c);
+       dsprintk(NDEBUG_PIO, instance, "residual %d\n", c);
 
        *count = c;
        *data = d;
        tmp = NCR5380_read(STATUS_REG);
        /* The phase read from the bus is valid if either REQ is (already)
-        * asserted or if ACK hasn't been released yet. The latter is the case if
-        * we're in MSGIN and all wanted bytes have been received.
+        * asserted or if ACK hasn't been released yet. The latter applies if
+        * we're in MSG IN, DATA IN or STATUS and all bytes have been received.
         */
-       if ((tmp & SR_REQ) || (p == PHASE_MSGIN && c == 0))
+       if ((tmp & SR_REQ) || ((tmp & SR_IO) && c == 0))
                *phase = tmp & PHASE_MASK;
        else
                *phase = PHASE_UNKNOWN;
@@ -1741,19 +1616,45 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance,
                return -1;
 }
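
   A usage sketch for NCR5380_transfer_pio(), mirroring the MESSAGE OUT
   step in NCR5380_select() above:

	unsigned char msg = IDENTIFY(1, cmd->device->lun);
	unsigned char *data = &msg;
	unsigned char phase = PHASE_MSGOUT;
	int len = 1;

	NCR5380_transfer_pio(instance, &phase, &len, &data);
	/* on return, len holds the residual byte count and phase the
	 * last phase observed on the bus (or PHASE_UNKNOWN) */
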
 
-/*
- * Function : do_abort (Scsi_Host *host)
+/**
+ * do_reset - issue a reset command
+ * @instance: adapter to reset
+ *
+ * Issue a reset sequence to the NCR5380 and try to get the bus
+ * back into a sane shape.
  *
- * Purpose : abort the currently established nexus.  Should only be
- *     called from a routine which can drop into a
+ * This clears the reset interrupt flag because there may be no handler for
+ * it. When the driver is initialized, the NCR5380_intr() handler has not yet
+ * been installed. And when in EH we may have released the ST DMA interrupt.
+ */
+
+static void do_reset(struct Scsi_Host *instance)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       NCR5380_write(TARGET_COMMAND_REG,
+                     PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK));
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST);
+       udelay(50);
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+       (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
+       local_irq_restore(flags);
+}
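
   A hedged sketch of how a bus-reset error handler might build on
   do_reset() (a real handler must also unwind connected/selecting commands
   and the issue, disconnected and autosense queues):

	static int bus_reset_sketch(struct scsi_cmnd *cmd)
	{
		struct Scsi_Host *instance = cmd->device->host;

		do_reset(instance);
		/* ...reset hostdata state, fail outstanding commands... */
		return SUCCESS;
	}
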
+
+/**
+ * do_abort - abort the currently established nexus by going to
+ * MESSAGE OUT phase and sending an ABORT message.
+ * @instance: relevant scsi host instance
  *
- * Returns 0 on success, -1 on failure.
+ * Returns 0 on success, -1 on failure.
  */
 
 static int do_abort(struct Scsi_Host *instance)
 {
-       unsigned char tmp, *msgptr, phase;
+       unsigned char *msgptr, phase, tmp;
        int len;
+       int rc;
 
        /* Request message out phase */
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
@@ -1768,16 +1669,20 @@ static int do_abort(struct Scsi_Host *instance)
         * the target sees, so we just handshake.
         */
 
-       while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ))
-               ;
+       rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 10 * HZ);
+       if (rc < 0)
+               goto timeout;
+
+       tmp = NCR5380_read(STATUS_REG) & PHASE_MASK;
 
        NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp));
 
-       if ((tmp & PHASE_MASK) != PHASE_MSGOUT) {
-               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN |
-                             ICR_ASSERT_ACK);
-               while (NCR5380_read(STATUS_REG) & SR_REQ)
-                       ;
+       if (tmp != PHASE_MSGOUT) {
+               NCR5380_write(INITIATOR_COMMAND_REG,
+                             ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK);
+               rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 3 * HZ);
+               if (rc < 0)
+                       goto timeout;
                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN);
        }
 
@@ -1793,26 +1698,29 @@ static int do_abort(struct Scsi_Host *instance)
         */
 
        return len ? -1 : 0;
+
+timeout:
+       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+       return -1;
 }
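
   For reference, the message-out step elided between these hunks
   presumably loads the standard single-byte SCSI ABORT message (code
   0x06), along the lines of:

	static unsigned char abort_msg[] = { ABORT };	/* 0x06 */

	msgptr = abort_msg;
	len = 1;
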
 
 #if defined(REAL_DMA)
 /*
  * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance,
- *      unsigned char *phase, int *count, unsigned char **data)
+ * unsigned char *phase, int *count, unsigned char **data)
  *
  * Purpose : transfers data in given phase using either real
- *     or pseudo DMA.
+ * or pseudo DMA.
  *
  * Inputs : instance - instance of driver, *phase - pointer to
- *     what phase is expected, *count - pointer to number of
- *     bytes to transfer, **data - pointer to data pointer.
+ * what phase is expected, *count - pointer to number of
+ * bytes to transfer, **data - pointer to data pointer.
  *
  * Returns : -1 when different phase is entered without transferring
- *     maximum number of bytes, 0 if all bytes or transferred or exit
- *     is in same phase.
- *
- *     Also, *phase, *count, *data are modified in place.
+ * maximum number of bytes, 0 if all bytes are transferred or exit
+ * is in same phase.
+ *
+ * Also, *phase, *count, *data are modified in place.
  */
 
 
@@ -1820,10 +1728,9 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance,
                                unsigned char *phase, int *count,
                                unsigned char **data)
 {
-       SETUP_HOSTDATA(instance);
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        register int c = *count;
        register unsigned char p = *phase;
-       unsigned long flags;
 
 #if defined(CONFIG_SUN3)
        /* sanity check */
@@ -1834,29 +1741,22 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance,
        }
        hostdata->dma_len = c;
 
-       dprintk(NDEBUG_DMA, "scsi%d: initializing DMA for %s, %d bytes %s %p\n",
-               instance->host_no, (p & SR_IO) ? "reading" : "writing",
-               c, (p & SR_IO) ? "to" : "from", *data);
+       dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n",
+                (p & SR_IO) ? "receive" : "send", c, *data);
 
-       /* netbsd turns off ints here, why not be safe and do it too */
-       local_irq_save(flags);
 
        /* send start chain */
        sun3scsi_dma_start(c, *data);
 
+       NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p));
+       NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY |
+                               MR_ENABLE_EOP_INTR);
        if (p & SR_IO) {
-               NCR5380_write(TARGET_COMMAND_REG, 1);
-               NCR5380_read(RESET_PARITY_INTERRUPT_REG);
                NCR5380_write(INITIATOR_COMMAND_REG, 0);
-               NCR5380_write(MODE_REG,
-                             (NCR5380_read(MODE_REG) | MR_DMA_MODE | MR_ENABLE_EOP_INTR));
                NCR5380_write(START_DMA_INITIATOR_RECEIVE_REG, 0);
        } else {
-               NCR5380_write(TARGET_COMMAND_REG, 0);
-               NCR5380_read(RESET_PARITY_INTERRUPT_REG);
                NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_DATA);
-               NCR5380_write(MODE_REG,
-                             (NCR5380_read(MODE_REG) | MR_DMA_MODE | MR_ENABLE_EOP_INTR));
                NCR5380_write(START_DMA_SEND_REG, 0);
        }
 
@@ -1864,8 +1764,6 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance,
        dregs->csr |= CSR_DMA_ENABLE;
 #endif
 
-       local_irq_restore(flags);
-
        sun3_dma_active = 1;
 
 #else /* !defined(CONFIG_SUN3) */
@@ -1880,25 +1778,20 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance,
        if (hostdata->read_overruns && (p & SR_IO))
                c -= hostdata->read_overruns;
 
-       dprintk(NDEBUG_DMA, "scsi%d: initializing DMA for %s, %d bytes %s %p\n",
-                  HOSTNO, (p & SR_IO) ? "reading" : "writing",
-                  c, (p & SR_IO) ? "to" : "from", d);
+       dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n",
+                (p & SR_IO) ? "receive" : "send", c, d);
 
        NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p));
-
-#ifdef REAL_DMA
-       NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_ENABLE_EOP_INTR | MR_MONITOR_BSY);
-#endif /* def REAL_DMA  */
+       NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY |
+                               MR_ENABLE_EOP_INTR);
 
        if (!(hostdata->flags & FLAG_LATE_DMA_SETUP)) {
                /* On the Medusa, it is a must to initialize the DMA before
                 * starting the NCR. This is also the cleaner way for the TT.
                 */
-               local_irq_save(flags);
                hostdata->dma_len = (p & SR_IO) ?
                        NCR5380_dma_read_setup(instance, d, c) :
                        NCR5380_dma_write_setup(instance, d, c);
-               local_irq_restore(flags);
        }
 
        if (p & SR_IO)
@@ -1912,11 +1805,9 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance,
                /* On the Falcon, the DMA setup must be done after the last */
                /* NCR access, else the DMA setup gets trashed!
                 */
-               local_irq_save(flags);
                hostdata->dma_len = (p & SR_IO) ?
                        NCR5380_dma_read_setup(instance, d, c) :
                        NCR5380_dma_write_setup(instance, d, c);
-               local_irq_restore(flags);
        }
 #endif /* !defined(CONFIG_SUN3) */
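
   The FLAG_LATE_DMA_SETUP handling above encodes a board-specific ordering
   constraint; schematically (hedged, condensed from the code and comments,
   receive direction only):

	/* Medusa/TT: program the ST DMA first, then start the 5380 */
	if (!(hostdata->flags & FLAG_LATE_DMA_SETUP))
		hostdata->dma_len = NCR5380_dma_read_setup(instance, d, c);

	NCR5380_write(START_DMA_INITIATOR_RECEIVE_REG, 0);

	/* Falcon: any 5380 access trashes the ST DMA setup, so the
	 * DMA setup must follow the last 5380 register write */
	if (hostdata->flags & FLAG_LATE_DMA_SETUP)
		hostdata->dma_len = NCR5380_dma_read_setup(instance, d, c);
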
 
@@ -1928,23 +1819,22 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance,
  * Function : NCR5380_information_transfer (struct Scsi_Host *instance)
  *
  * Purpose : run through the various SCSI phases and do as the target
- *     directs us to.  Operates on the currently connected command,
- *     instance->connected.
+ * directs us to.  Operates on the currently connected command,
+ * instance->connected.
  *
  * Inputs : instance, instance for which we are doing commands
  *
  * Side effects : SCSI things happen, the disconnected queue will be
- *     modified if a command disconnects, *instance->connected will
- *     change.
+ * modified if a command disconnects, *instance->connected will
+ * change.
  *
  * XXX Note : we need to watch for bus free or a reset condition here
- *     to recover from an unexpected bus free condition.
+ * to recover from an unexpected bus free condition.
  */
 
 static void NCR5380_information_transfer(struct Scsi_Host *instance)
 {
-       SETUP_HOSTDATA(instance);
-       unsigned long flags;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        unsigned char msgout = NOP;
        int sink = 0;
        int len;
@@ -1953,13 +1843,15 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
 #endif
        unsigned char *data;
        unsigned char phase, tmp, extended_msg[10], old_phase = 0xff;
-       struct scsi_cmnd *cmd = (struct scsi_cmnd *) hostdata->connected;
+       struct scsi_cmnd *cmd;
 
 #ifdef SUN3_SCSI_VME
        dregs->csr |= CSR_INTR;
 #endif
 
-       while (1) {
+       while ((cmd = hostdata->connected)) {
+               struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
+
                tmp = NCR5380_read(STATUS_REG);
                /* We only have a valid SCSI phase when REQ is asserted */
                if (tmp & SR_REQ) {
@@ -1984,7 +1876,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                /* this command setup for dma yet? */
                                if ((count >= DMA_MIN_SIZE) && (sun3_dma_setup_done != cmd)) {
                                        if (cmd->request->cmd_type == REQ_TYPE_FS) {
-                                               sun3scsi_dma_setup(d, count,
+                                               sun3scsi_dma_setup(instance, d, count,
                                                                   rq_data_dir(cmd->request));
                                                sun3_dma_setup_done = cmd;
                                        }
@@ -2000,11 +1892,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp));
 
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN |
-                                             ICR_ASSERT_ACK);
+                                             ICR_ASSERT_ACK);
                                while (NCR5380_read(STATUS_REG) & SR_REQ)
                                        ;
                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
-                                             ICR_ASSERT_ATN);
+                                             ICR_ASSERT_ATN);
                                sink = 0;
                                continue;
                        }
@@ -2012,12 +1904,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                        switch (phase) {
                        case PHASE_DATAOUT:
 #if (NDEBUG & NDEBUG_NO_DATAOUT)
-                               printk("scsi%d: NDEBUG_NO_DATAOUT set, attempted DATAOUT "
-                                      "aborted\n", HOSTNO);
+                               shost_printk(KERN_DEBUG, instance, "NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n");
                                sink = 1;
                                do_abort(instance);
                                cmd->result = DID_ERROR << 16;
-                               cmd->scsi_done(cmd);
+                               complete_cmd(instance, cmd);
                                return;
 #endif
                        case PHASE_DATAIN:
@@ -2031,13 +1922,10 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                        --cmd->SCp.buffers_residual;
                                        cmd->SCp.this_residual = cmd->SCp.buffer->length;
                                        cmd->SCp.ptr = sg_virt(cmd->SCp.buffer);
-                                       /* ++roman: Try to merge some scatter-buffers if
-                                        * they are at contiguous physical addresses.
-                                        */
                                        merge_contiguous_buffers(cmd);
-                                       dprintk(NDEBUG_INFORMATION, "scsi%d: %d bytes and %d buffers left\n",
-                                                  HOSTNO, cmd->SCp.this_residual,
-                                                  cmd->SCp.buffers_residual);
+                                       dsprintk(NDEBUG_INFORMATION, instance, "%d bytes and %d buffers left\n",
+                                                cmd->SCp.this_residual,
+                                                cmd->SCp.buffers_residual);
                                }
 
                                /*
@@ -2051,16 +1939,18 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                 */
 
                                /* ++roman: I suggest, this should be
-                                *   #if def(REAL_DMA)
+                                * #if def(REAL_DMA)
                                 * instead of leaving REAL_DMA out.
                                 */
 
 #if defined(REAL_DMA)
-                               if (
 #if !defined(CONFIG_SUN3)
-                                   !cmd->device->borken &&
+                               transfersize = 0;
+                               if (!cmd->device->borken)
 #endif
-                                   (transfersize = NCR5380_dma_xfer_len(instance, cmd, phase)) >= DMA_MIN_SIZE) {
+                                       transfersize = NCR5380_dma_xfer_len(instance, cmd, phase);
+
+                               if (transfersize >= DMA_MIN_SIZE) {
                                        len = transfersize;
                                        cmd->SCp.phase = phase;
                                        if (NCR5380_transfer_dma(instance, &phase,
@@ -2068,16 +1958,15 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                                /*
                                                 * If the watchdog timer fires, all future
                                                 * accesses to this device will use the
-                                                * polled-IO. */
+                                                * polled-IO.
+                                                */
                                                scmd_printk(KERN_INFO, cmd,
                                                        "switching to slow handshake\n");
                                                cmd->device->borken = 1;
-                                               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE |
-                                                       ICR_ASSERT_ATN);
                                                sink = 1;
                                                do_abort(instance);
                                                cmd->result = DID_ERROR << 16;
-                                               cmd->scsi_done(cmd);
+                                               complete_cmd(instance, cmd);
                                                /* XXX - need to source or sink data here, as appropriate */
                                        } else {
 #ifdef REAL_DMA
@@ -2093,9 +1982,13 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                        }
                                } else
 #endif /* defined(REAL_DMA) */
+                               {
+                                       spin_unlock_irq(&hostdata->lock);
                                        NCR5380_transfer_pio(instance, &phase,
-                                                            (int *)&cmd->SCp.this_residual,
-                                                            (unsigned char **)&cmd->SCp.ptr);
+                                                            (int *)&cmd->SCp.this_residual,
+                                                            (unsigned char **)&cmd->SCp.ptr);
+                                       spin_lock_irq(&hostdata->lock);
+                               }
 #if defined(CONFIG_SUN3) && defined(REAL_DMA)
                                /* if we had intended to dma that command clear it */
                                if (sun3_dma_setup_done == cmd)
@@ -2105,162 +1998,64 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                        case PHASE_MSGIN:
                                len = 1;
                                data = &tmp;
-                               NCR5380_write(SELECT_ENABLE_REG, 0);    /* disable reselects */
                                NCR5380_transfer_pio(instance, &phase, &len, &data);
                                cmd->SCp.Message = tmp;
 
                                switch (tmp) {
-                               /*
-                                * Linking lets us reduce the time required to get the
-                                * next command out to the device, hopefully this will
-                                * mean we don't waste another revolution due to the delays
-                                * required by ARBITRATION and another SELECTION.
-                                *
-                                * In the current implementation proposal, low level drivers
-                                * merely have to start the next command, pointed to by
-                                * next_link, done() is called as with unlinked commands.
-                                */
-#ifdef LINKED
-                               case LINKED_CMD_COMPLETE:
-                               case LINKED_FLG_CMD_COMPLETE:
-                                       /* Accept message by clearing ACK */
-                                       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-
-                                       dprintk(NDEBUG_LINKED, "scsi%d: target %d lun %llu linked command "
-                                                  "complete.\n", HOSTNO, cmd->device->id, cmd->device->lun);
-
-                                       /* Enable reselect interrupts */
-                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-                                       /*
-                                        * Sanity check : A linked command should only terminate
-                                        * with one of these messages if there are more linked
-                                        * commands available.
-                                        */
-
-                                       if (!cmd->next_link) {
-                                                printk(KERN_NOTICE "scsi%d: target %d lun %llu "
-                                                       "linked command complete, no next_link\n",
-                                                       HOSTNO, cmd->device->id, cmd->device->lun);
-                                               sink = 1;
-                                               do_abort(instance);
-                                               return;
-                                       }
-
-                                       initialize_SCp(cmd->next_link);
-                                       /* The next command is still part of this process; copy it
-                                        * and don't free it! */
-                                       cmd->next_link->tag = cmd->tag;
-                                       cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8);
-                                       dprintk(NDEBUG_LINKED, "scsi%d: target %d lun %llu linked request "
-                                                  "done, calling scsi_done().\n",
-                                                  HOSTNO, cmd->device->id, cmd->device->lun);
-                                       cmd->scsi_done(cmd);
-                                       cmd = hostdata->connected;
-                                       break;
-#endif /* def LINKED */
                                case ABORT:
                                case COMMAND_COMPLETE:
                                        /* Accept message by clearing ACK */
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                       dprintk(NDEBUG_QUEUES, "scsi%d: command for target %d, lun %llu "
-                                                 "completed\n", HOSTNO, cmd->device->id, cmd->device->lun);
+                                       dsprintk(NDEBUG_QUEUES, instance,
+                                                "COMMAND COMPLETE %p target %d lun %llu\n",
+                                                cmd, scmd_id(cmd), cmd->device->lun);
 
-                                       local_irq_save(flags);
-                                       hostdata->retain_dma_intr++;
                                        hostdata->connected = NULL;
 #ifdef SUPPORT_TAGS
                                        cmd_free_tag(cmd);
                                        if (status_byte(cmd->SCp.Status) == QUEUE_FULL) {
-                                               /* Turn a QUEUE FULL status into BUSY, I think the
-                                                * mid level cannot handle QUEUE FULL :-( (The
-                                                * command is retried after BUSY). Also update our
-                                                * queue size to the number of currently issued
-                                                * commands now.
-                                                */
-                                               /* ++Andreas: the mid level code knows about
-                                                  QUEUE_FULL now. */
-                                               struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][cmd->device->lun];
-                                               dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %llu returned "
-                                                          "QUEUE_FULL after %d commands\n",
-                                                          HOSTNO, cmd->device->id, cmd->device->lun,
-                                                          ta->nr_allocated);
+                                               u8 lun = cmd->device->lun;
+                                               struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun];
+
+                                               dsprintk(NDEBUG_TAGS, instance,
+                                                        "QUEUE_FULL %p target %d lun %d nr_allocated %d\n",
+                                                        cmd, scmd_id(cmd), lun, ta->nr_allocated);
                                                if (ta->queue_size > ta->nr_allocated)
-                                                       ta->nr_allocated = ta->queue_size;
+                                                       ta->queue_size = ta->nr_allocated;
                                        }
-#else
-                                       hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun);
 #endif
-                                       /* Enable reselect interrupts */
-                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-
-                                       /*
-                                        * I'm not sure what the correct thing to do here is :
-                                        *
-                                        * If the command that just executed is NOT a request
-                                        * sense, the obvious thing to do is to set the result
-                                        * code to the values of the stored parameters.
-                                        *
-                                        * If it was a REQUEST SENSE command, we need some way to
-                                        * differentiate between the failure code of the original
-                                        * and the failure code of the REQUEST sense - the obvious
-                                        * case is success, where we fall through and leave the
-                                        * result code unchanged.
-                                        *
-                                        * The non-obvious place is where the REQUEST SENSE failed
-                                        */
-
-                                       if (cmd->cmnd[0] != REQUEST_SENSE)
-                                               cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8);
-                                       else if (status_byte(cmd->SCp.Status) != GOOD)
-                                               cmd->result = (cmd->result & 0x00ffff) | (DID_ERROR << 16);
-
-                                       if ((cmd->cmnd[0] == REQUEST_SENSE) &&
-                                               hostdata->ses.cmd_len) {
-                                               scsi_eh_restore_cmnd(cmd, &hostdata->ses);
-                                               hostdata->ses.cmd_len = 0 ;
-                                       }
-
-                                       if ((cmd->cmnd[0] != REQUEST_SENSE) &&
-                                           (status_byte(cmd->SCp.Status) == CHECK_CONDITION)) {
-                                               scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0);
 
-                                               dprintk(NDEBUG_AUTOSENSE, "scsi%d: performing request sense\n", HOSTNO);
-
-                                               LIST(cmd,hostdata->issue_queue);
-                                               SET_NEXT(cmd, hostdata->issue_queue);
-                                               hostdata->issue_queue = (struct scsi_cmnd *) cmd;
-                                               dprintk(NDEBUG_QUEUES, "scsi%d: REQUEST SENSE added to head of "
-                                                         "issue queue\n", H_NO(cmd));
-                                       } else {
-                                               cmd->scsi_done(cmd);
+                                       cmd->result &= ~0xffff;
+                                       cmd->result |= cmd->SCp.Status;
+                                       cmd->result |= cmd->SCp.Message << 8;
+
+                                       if (cmd->cmnd[0] == REQUEST_SENSE)
+                                               complete_cmd(instance, cmd);
+                                       else {
+                                               if (cmd->SCp.Status == SAM_STAT_CHECK_CONDITION ||
+                                                   cmd->SCp.Status == SAM_STAT_COMMAND_TERMINATED) {
+                                                       dsprintk(NDEBUG_QUEUES, instance, "autosense: adding cmd %p to tail of autosense queue\n",
+                                                                cmd);
+                                                       list_add_tail(&ncmd->list,
+                                                                     &hostdata->autosense);
+                                               } else
+                                                       complete_cmd(instance, cmd);
                                        }
 
-                                       local_irq_restore(flags);
-
-                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
                                        /*
                                         * Restore phase bits to 0 so an interrupted selection,
                                         * arbitration can resume.
                                         */
                                        NCR5380_write(TARGET_COMMAND_REG, 0);
 
-                                       while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected)
-                                               barrier();
+                                       /* Enable reselect interrupts */
+                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
 
-                                       local_irq_save(flags);
-                                       hostdata->retain_dma_intr--;
-                                       /* ++roman: For Falcon SCSI, release the lock on the
-                                        * ST-DMA here if no other commands are waiting on the
-                                        * disconnected queue.
-                                        */
                                        maybe_release_dma_irq(instance);
-                                       local_irq_restore(flags);
                                        return;
                                case MESSAGE_REJECT:
                                        /* Accept message by clearing ACK */
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                       /* Enable reselect interrupts */
-                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
                                        switch (hostdata->last_message) {
                                        case HEAD_OF_QUEUE_TAG:
                                        case ORDERED_QUEUE_TAG:
@@ -2274,27 +2069,20 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                                cmd->device->tagged_supported = 0;
                                                hostdata->busy[cmd->device->id] |= (1 << cmd->device->lun);
                                                cmd->tag = TAG_NONE;
-                                               dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %llu rejected "
-                                                          "QUEUE_TAG message; tagged queuing "
-                                                          "disabled\n",
-                                                          HOSTNO, cmd->device->id, cmd->device->lun);
+                                               dsprintk(NDEBUG_TAGS, instance, "target %d lun %llu rejected QUEUE_TAG message; tagged queuing disabled\n",
+                                                        scmd_id(cmd), cmd->device->lun);
                                                break;
                                        }
                                        break;
                                case DISCONNECT:
                                        /* Accept message by clearing ACK */
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                       local_irq_save(flags);
-                                       cmd->device->disconnect = 1;
-                                       LIST(cmd,hostdata->disconnected_queue);
-                                       SET_NEXT(cmd, hostdata->disconnected_queue);
                                        hostdata->connected = NULL;
-                                       hostdata->disconnected_queue = cmd;
-                                       local_irq_restore(flags);
-                                       dprintk(NDEBUG_QUEUES, "scsi%d: command for target %d lun %llu was "
-                                                 "moved from connected to the "
-                                                 "disconnected_queue\n", HOSTNO,
-                                                 cmd->device->id, cmd->device->lun);
+                                       list_add(&ncmd->list, &hostdata->disconnected);
+                                       dsprintk(NDEBUG_INFORMATION | NDEBUG_QUEUES,
+                                                instance, "connected command %p for target %d lun %llu moved to disconnected queue\n",
+                                                cmd, scmd_id(cmd), cmd->device->lun);
+
                                        /*
                                         * Restore phase bits to 0 so an interrupted selection,
                                         * arbitration can resume.
@@ -2303,9 +2091,6 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
 
                                        /* Enable reselect interrupts */
                                        NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
-                                       /* Wait for bus free to avoid nasty timeouts */
-                                       while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected)
-                                               barrier();
 #ifdef SUN3_SCSI_VME
                                        dregs->csr |= CSR_DMA_ENABLE;
 #endif
@@ -2324,37 +2109,30 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                case RESTORE_POINTERS:
                                        /* Accept message by clearing ACK */
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-                                       /* Enable reselect interrupts */
-                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
                                        break;
                                case EXTENDED_MESSAGE:
                                        /*
-                                        * Extended messages are sent in the following format :
-                                        * Byte
-                                        * 0            EXTENDED_MESSAGE == 1
-                                        * 1            length (includes one byte for code, doesn't
-                                        *              include first two bytes)
-                                        * 2            code
-                                        * 3..length+1  arguments
-                                        *
-                                        * Start the extended message buffer with the EXTENDED_MESSAGE
+                                        * Start the message buffer with the EXTENDED_MESSAGE
                                         * byte, since spi_print_msg() wants the whole thing.
                                         */
                                        extended_msg[0] = EXTENDED_MESSAGE;
                                        /* Accept first byte by clearing ACK */
                                        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
 
-                                       dprintk(NDEBUG_EXTENDED, "scsi%d: receiving extended message\n", HOSTNO);
+                                       spin_unlock_irq(&hostdata->lock);
+
+                                       dsprintk(NDEBUG_EXTENDED, instance, "receiving extended message\n");
 
                                        len = 2;
                                        data = extended_msg + 1;
                                        phase = PHASE_MSGIN;
                                        NCR5380_transfer_pio(instance, &phase, &len, &data);
-                                       dprintk(NDEBUG_EXTENDED, "scsi%d: length=%d, code=0x%02x\n", HOSTNO,
-                                                  (int)extended_msg[1], (int)extended_msg[2]);
+                                       dsprintk(NDEBUG_EXTENDED, instance, "length %d, code 0x%02x\n",
+                                                (int)extended_msg[1],
+                                                (int)extended_msg[2]);
 
-                                       if (!len && extended_msg[1] <=
-                                           (sizeof(extended_msg) - 1)) {
+                                       if (!len && extended_msg[1] > 0 &&
+                                           extended_msg[1] <= sizeof(extended_msg) - 2) {
                                                /* Accept third byte by clearing ACK */
                                                NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
                                                len = extended_msg[1] - 1;
@@ -2362,8 +2140,8 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                                phase = PHASE_MSGIN;
 
                                                NCR5380_transfer_pio(instance, &phase, &len, &data);
-                                               dprintk(NDEBUG_EXTENDED, "scsi%d: message received, residual %d\n",
-                                                          HOSTNO, len);
+                                               dsprintk(NDEBUG_EXTENDED, instance, "message received, residual %d\n",
+                                                        len);
 
                                                switch (extended_msg[2]) {
                                                case EXTENDED_SDTR:
@@ -2373,15 +2151,18 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                                        tmp = 0;
                                                }
                                        } else if (len) {
-                                               printk(KERN_NOTICE "scsi%d: error receiving "
-                                                      "extended message\n", HOSTNO);
+                                               shost_printk(KERN_ERR, instance, "error receiving extended message\n");
                                                tmp = 0;
                                        } else {
-                                               printk(KERN_NOTICE "scsi%d: extended message "
-                                                          "code %02x length %d is too long\n",
-                                                          HOSTNO, extended_msg[2], extended_msg[1]);
+                                               shost_printk(KERN_NOTICE, instance, "extended message code %02x length %d is too long\n",
+                                                            extended_msg[2], extended_msg[1]);
                                                tmp = 0;
                                        }
+
+                                       spin_lock_irq(&hostdata->lock);
+                                       if (!hostdata->connected)
+                                               return;
+
                                        /* Fall through to reject message */
 
                                        /*
@@ -2390,8 +2171,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                         */
                                default:
                                        if (!tmp) {
-                                               printk(KERN_INFO "scsi%d: rejecting message ",
-                                                      instance->host_no);
+                                               shost_printk(KERN_ERR, instance, "rejecting message ");
                                                spi_print_msg(extended_msg);
                                                printk("\n");
                                        } else if (tmp != EXTENDED_MESSAGE)
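The layout comment removed above still governs the new bounds check: an extended message is byte 0 = EXTENDED_MESSAGE (0x01), byte 1 = length (one code byte plus the arguments), byte 2 = code, bytes 3..length+1 = arguments. Since bytes 0 and 1 are the header, a buffer of size N holds at most N - 2 length-counted bytes, which is what the rewritten `extended_msg[1] <= sizeof(extended_msg) - 2` test enforces. A standalone sketch, illustrative only and not driver code:

#include <stdbool.h>
#include <stddef.h>

static bool ext_msg_fits(const unsigned char *msg, size_t bufsize)
{
        if (msg[0] != 0x01)             /* EXTENDED_MESSAGE */
                return false;
        if (msg[1] == 0)                /* length 0 means 256 bytes follow (SCSI-2) */
                return false;           /* ...which certainly will not fit */
        return msg[1] <= bufsize - 2;   /* header occupies bytes 0 and 1 */
}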
@@ -2414,18 +2194,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                hostdata->last_message = msgout;
                                NCR5380_transfer_pio(instance, &phase, &len, &data);
                                if (msgout == ABORT) {
-                                       local_irq_save(flags);
-#ifdef SUPPORT_TAGS
-                                       cmd_free_tag(cmd);
-#else
-                                       hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun);
-#endif
                                        hostdata->connected = NULL;
                                        cmd->result = DID_ERROR << 16;
-                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
+                                       complete_cmd(instance, cmd);
                                        maybe_release_dma_irq(instance);
-                                       local_irq_restore(flags);
-                                       cmd->scsi_done(cmd);
+                                       NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask);
                                        return;
                                }
                                msgout = NOP;
@@ -2447,22 +2220,25 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
                                cmd->SCp.Status = tmp;
                                break;
                        default:
-                               printk("scsi%d: unknown phase\n", HOSTNO);
+                               shost_printk(KERN_ERR, instance, "unknown phase\n");
                                NCR5380_dprint(NDEBUG_ANY, instance);
                        } /* switch(phase) */
-               } /* if (tmp * SR_REQ) */
-       } /* while (1) */
+               } else {
+                       spin_unlock_irq(&hostdata->lock);
+                       NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ);
+                       spin_lock_irq(&hostdata->lock);
+               }
+       }
 }
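In the rewritten main loop above, the else branch releases hostdata->lock before waiting for REQ and retakes it afterwards: NCR5380_poll_politely can sleep, and sleeping with a spinlock held is forbidden. The same unlock/wait/relock shape, as a self-contained sketch with hypothetical names (poll_ctx, done):

#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

struct poll_ctx {
        spinlock_t lock;
        wait_queue_head_t wq;
        bool done;
};

static void wait_then_recheck(struct poll_ctx *ctx)
{
        spin_unlock_irq(&ctx->lock);
        /* Safe to sleep here; the lock is no longer held. */
        wait_event_timeout(ctx->wq, READ_ONCE(ctx->done), HZ);
        spin_lock_irq(&ctx->lock);
        /* State observed before the unlock must be revalidated now. */
}

The `if (!hostdata->connected) return;` added after the extended-message transfer above is exactly such a revalidation.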
 
 /*
  * Function : void NCR5380_reselect (struct Scsi_Host *instance)
  *
  * Purpose : does reselection, initializing the instance->connected
- *     field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q
- *     nexus has been reestablished,
+ * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q
+ * nexus has been reestablished,
  *
  * Inputs : instance - this instance of the NCR5380.
- *
  */
 
 
@@ -2471,7 +2247,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance)
 
 static void NCR5380_reselect(struct Scsi_Host *instance)
 {
-       SETUP_HOSTDATA(instance);
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        unsigned char target_mask;
        unsigned char lun;
 #ifdef SUPPORT_TAGS
@@ -2480,7 +2256,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
        unsigned char msg[3];
        int __maybe_unused len;
        unsigned char __maybe_unused *data, __maybe_unused phase;
-       struct scsi_cmnd *tmp = NULL, *prev;
+       struct NCR5380_cmd *ncmd;
+       struct scsi_cmnd *tmp;
 
        /*
         * Disable arbitration, etc. since the host adapter obviously
@@ -2488,11 +2265,10 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
         */
 
        NCR5380_write(MODE_REG, MR_BASE);
-       hostdata->restart_select = 1;
 
        target_mask = NCR5380_read(CURRENT_SCSI_DATA_REG) & ~(hostdata->id_mask);
 
-       dprintk(NDEBUG_RESELECTION, "scsi%d: reselect\n", HOSTNO);
+       dsprintk(NDEBUG_RESELECTION, instance, "reselect\n");
 
        /*
         * At this point, we have detected that our SCSI ID is on the bus,
@@ -2504,17 +2280,22 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
         */
 
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY);
-
-       while (NCR5380_read(STATUS_REG) & SR_SEL)
-               ;
+       if (NCR5380_poll_politely(instance,
+                                 STATUS_REG, SR_SEL, 0, 2 * HZ) < 0) {
+               NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
+               return;
+       }
        NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
 
        /*
         * Wait for target to go into MSGIN.
         */
 
-       while (!(NCR5380_read(STATUS_REG) & SR_REQ))
-               ;
+       if (NCR5380_poll_politely(instance,
+                                 STATUS_REG, SR_REQ, SR_REQ, 2 * HZ) < 0) {
+               do_abort(instance);
+               return;
+       }
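Here and below, the raw `while (NCR5380_read(...)) ;` spins are replaced by NCR5380_poll_politely(instance, reg, mask, wanted, timeout), which, as these call sites show, returns 0 once (reg & mask) == wanted and a negative value on timeout, so a wedged target can no longer hang the CPU. The helper itself lives in the shared NCR5380 core rather than in these hunks; the sketch below only illustrates the contract the call sites imply (the real implementation differs in detail) and assumes the board's NCR5380_read() accessor is in scope:

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int poll_reg_sketch(struct Scsi_Host *instance, unsigned int reg,
                           u8 mask, u8 wanted, unsigned long timeout)
{
        unsigned long deadline = jiffies + timeout;

        do {
                if ((NCR5380_read(reg) & mask) == wanted)
                        return 0;
                msleep(1);      /* yield the CPU instead of barrier() */
        } while (time_before(jiffies, deadline));

        return -ETIMEDOUT;
}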
 
 #if defined(CONFIG_SUN3) && defined(REAL_DMA)
        /* acknowledge toggle to MSGIN */
@@ -2527,15 +2308,21 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
        data = msg;
        phase = PHASE_MSGIN;
        NCR5380_transfer_pio(instance, &phase, &len, &data);
+
+       if (len) {
+               do_abort(instance);
+               return;
+       }
 #endif
 
        if (!(msg[0] & 0x80)) {
-               printk(KERN_DEBUG "scsi%d: expecting IDENTIFY message, got ", HOSTNO);
+               shost_printk(KERN_ERR, instance, "expecting IDENTIFY message, got ");
                spi_print_msg(msg);
+               printk("\n");
                do_abort(instance);
                return;
        }
-       lun = (msg[0] & 0x07);
+       lun = msg[0] & 0x07;
 
 #if defined(SUPPORT_TAGS) && !defined(CONFIG_SUN3)
        /* If the phase is still MSGIN, the target wants to send some more
@@ -2551,8 +2338,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
                if (!NCR5380_transfer_pio(instance, &phase, &len, &data) &&
                    msg[1] == SIMPLE_QUEUE_TAG)
                        tag = msg[2];
-               dprintk(NDEBUG_TAGS, "scsi%d: target mask %02x, lun %d sent tag %d at "
-                          "reselection\n", HOSTNO, target_mask, lun, tag);
+               dsprintk(NDEBUG_TAGS, instance, "reselect: target mask %02x, lun %d sent tag %d\n",
+                        target_mask, lun, tag);
        }
 #endif
 
@@ -2561,36 +2348,34 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
         * just reestablished, and remove it from the disconnected queue.
         */
 
-       for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue, prev = NULL;
-            tmp; prev = tmp, tmp = NEXT(tmp)) {
-               if ((target_mask == (1 << tmp->device->id)) && (lun == tmp->device->lun)
+       tmp = NULL;
+       list_for_each_entry(ncmd, &hostdata->disconnected, list) {
+               struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);
+
+               if (target_mask == (1 << scmd_id(cmd)) &&
+                   lun == (u8)cmd->device->lun
 #ifdef SUPPORT_TAGS
-                   && (tag == tmp->tag)
+                   && (tag == cmd->tag)
 #endif
                    ) {
-                       if (prev) {
-                               REMOVE(prev, NEXT(prev), tmp, NEXT(tmp));
-                               SET_NEXT(prev, NEXT(tmp));
-                       } else {
-                               REMOVE(-1, hostdata->disconnected_queue, tmp, NEXT(tmp));
-                               hostdata->disconnected_queue = NEXT(tmp);
-                       }
-                       SET_NEXT(tmp, NULL);
+                       list_del(&ncmd->list);
+                       tmp = cmd;
                        break;
                }
        }
 
-       if (!tmp) {
-               printk(KERN_WARNING "scsi%d: warning: target bitmask %02x lun %d "
-#ifdef SUPPORT_TAGS
-                      "tag %d "
-#endif
-                      "not in disconnected_queue.\n",
-                      HOSTNO, target_mask, lun
+       if (tmp) {
+               dsprintk(NDEBUG_RESELECTION | NDEBUG_QUEUES, instance,
+                        "reselect: removed %p from disconnected queue\n", tmp);
+       } else {
 #ifdef SUPPORT_TAGS
-                      , tag
+               shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d tag %d not in disconnected queue.\n",
+                            target_mask, lun, tag);
+#else
+               shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d not in disconnected queue.\n",
+                            target_mask, lun);
 #endif
-                       );
                /*
                 * Since we have an established nexus that we can't do anything
                 * with, we must abort it.
@@ -2614,7 +2399,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
                }
                /* setup this command for dma if not already */
                if ((count >= DMA_MIN_SIZE) && (sun3_dma_setup_done != tmp)) {
-                       sun3scsi_dma_setup(d, count, rq_data_dir(tmp->request));
+                       sun3scsi_dma_setup(instance, d, count,
+                                          rq_data_dir(tmp->request));
                        sun3_dma_setup_done = tmp;
                }
        }
@@ -2639,235 +2425,196 @@ static void NCR5380_reselect(struct Scsi_Host *instance)
                if (!NCR5380_transfer_pio(instance, &phase, &len, &data) &&
                    msg[1] == SIMPLE_QUEUE_TAG)
                        tag = msg[2];
-               dprintk(NDEBUG_TAGS, "scsi%d: target mask %02x, lun %d sent tag %d at reselection\n",
-                       HOSTNO, target_mask, lun, tag);
+               dsprintk(NDEBUG_TAGS, instance, "reselect: target mask %02x, lun %d sent tag %d\n",
+                        target_mask, lun, tag);
        }
 #endif
 
        hostdata->connected = tmp;
-       dprintk(NDEBUG_RESELECTION, "scsi%d: nexus established, target = %d, lun = %llu, tag = %d\n",
-                  HOSTNO, tmp->device->id, tmp->device->lun, tmp->tag);
+       dsprintk(NDEBUG_RESELECTION, instance, "nexus established, target %d, lun %llu, tag %d\n",
+                scmd_id(tmp), tmp->device->lun, tmp->tag);
 }
 
 
-/*
- * Function : int NCR5380_abort (struct scsi_cmnd *cmd)
- *
- * Purpose : abort a command
- *
- * Inputs : cmd - the scsi_cmnd to abort, code - code to set the
- *     host byte of the result field to, if zero DID_ABORTED is
- *     used.
- *
- * Returns : SUCCESS - success, FAILED on failure.
- *
- * XXX - there is no way to abort the command that is currently
- *      connected, you have to wait for it to complete.  If this is
- *      a problem, we could implement longjmp() / setjmp(), setjmp()
- *      called where the loop started in NCR5380_main().
+/**
+ * list_find_cmd - test for presence of a command in a linked list
+ * @haystack: list of commands
+ * @needle: command to search for
  */
 
-static
-int NCR5380_abort(struct scsi_cmnd *cmd)
+static bool list_find_cmd(struct list_head *haystack,
+                          struct scsi_cmnd *needle)
 {
-       struct Scsi_Host *instance = cmd->device->host;
-       SETUP_HOSTDATA(instance);
-       struct scsi_cmnd *tmp, **prev;
-       unsigned long flags;
+       struct NCR5380_cmd *ncmd;
 
-       scmd_printk(KERN_NOTICE, cmd, "aborting command\n");
+       list_for_each_entry(ncmd, haystack, list)
+               if (NCR5380_to_scmd(ncmd) == needle)
+                       return true;
+       return false;
+}
 
-       NCR5380_print_status(instance);
+/**
+ * list_del_cmd - remove a command from linked list
+ * @haystack: list of commands
+ * @needle: command to remove
+ */
 
-       local_irq_save(flags);
+static bool list_del_cmd(struct list_head *haystack,
+                         struct scsi_cmnd *needle)
+{
+       if (list_find_cmd(haystack, needle)) {
+               struct NCR5380_cmd *ncmd = scsi_cmd_priv(needle);
 
-       dprintk(NDEBUG_ABORT, "scsi%d: abort called basr 0x%02x, sr 0x%02x\n", HOSTNO,
-                   NCR5380_read(BUS_AND_STATUS_REG),
-                   NCR5380_read(STATUS_REG));
+               list_del(&ncmd->list);
+               return true;
+       }
+       return false;
+}
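list_find_cmd() and list_del_cmd() walk lists of struct NCR5380_cmd, a small node embedded in each command's private data (allocated through the `.cmd_size = NCR5380_CMD_SIZE` entries added to the host templates below); NCR5380_to_scmd() maps a node back to the scsi_cmnd that owns it. The embedded-node idiom in miniature, with hypothetical names (struct item, find_item):

#include <linux/list.h>

struct item {
        int id;
        struct list_head node;  /* embedded in the owning object */
};

static struct item *find_item(struct list_head *haystack, int id)
{
        struct item *it;

        /* list_for_each_entry() recovers 'it' from each embedded node,
         * much as NCR5380_to_scmd() recovers the owning scsi_cmnd. */
        list_for_each_entry(it, haystack, node)
                if (it->id == id)
                        return it;      /* caller may list_del(&it->node) */
        return NULL;
}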
 
-#if 1
-       /*
-        * Case 1 : If the command is the currently executing command,
-        * we'll set the aborted flag and return control so that
-        * information transfer routine can exit cleanly.
-        */
+/**
+ * NCR5380_abort - scsi host eh_abort_handler() method
+ * @cmd: the command to be aborted
+ *
+ * Try to abort a given command by removing it from queues and/or sending
+ * the target an abort message. This may not succeed in causing a target
+ * to abort the command. Nonetheless, the low-level driver must forget about
+ * the command because the mid-layer reclaims it and it may be re-issued.
+ *
+ * The normal path taken by a command is as follows. For EH we trace this
+ * same path to locate and abort the command.
+ *
+ * unissued -> selecting -> [unissued -> selecting ->]... connected ->
+ * [disconnected -> connected ->]...
+ * [autosense -> connected ->] done
+ *
+ * If cmd is unissued then just remove it.
+ * If cmd is disconnected, try to select the target.
+ * If cmd is connected, try to send an abort message.
+ * If cmd is waiting for autosense, give it a chance to complete but check
+ * that it isn't left connected.
+ * If cmd was not found at all then presumably it has already been completed,
+ * in which case return SUCCESS to try to avoid further EH measures.
+ * If the command has not completed yet, we must not fail to find it.
+ */
 
-       if (hostdata->connected == cmd) {
+static int NCR5380_abort(struct scsi_cmnd *cmd)
+{
+       struct Scsi_Host *instance = cmd->device->host;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       unsigned long flags;
+       int result = SUCCESS;
 
-               dprintk(NDEBUG_ABORT, "scsi%d: aborting connected command\n", HOSTNO);
-               /*
-                * We should perform BSY checking, and make sure we haven't slipped
-                * into BUS FREE.
-                */
+       spin_lock_irqsave(&hostdata->lock, flags);
 
-               /*      NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_ATN); */
-               /*
-                * Since we can't change phases until we've completed the current
-                * handshake, we have to source or sink a byte of data if the current
-                * phase is not MSGOUT.
-                */
+#if (NDEBUG & NDEBUG_ANY)
+       scmd_printk(KERN_INFO, cmd, __func__);
+#endif
+       NCR5380_dprint(NDEBUG_ANY, instance);
+       NCR5380_dprint_phase(NDEBUG_ANY, instance);
 
-               /*
-                * Return control to the executing NCR drive so we can clear the
-                * aborted flag and get back into our main loop.
-                */
+       if (list_del_cmd(&hostdata->unissued, cmd)) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: removed %p from issue queue\n", cmd);
+               cmd->result = DID_ABORT << 16;
+               cmd->scsi_done(cmd); /* No tag or busy flag to worry about */
+       }
 
-               if (do_abort(instance) == 0) {
-                       hostdata->aborted = 1;
-                       hostdata->connected = NULL;
-                       cmd->result = DID_ABORT << 16;
-#ifdef SUPPORT_TAGS
-                       cmd_free_tag(cmd);
-#else
-                       hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun);
-#endif
-                       maybe_release_dma_irq(instance);
-                       local_irq_restore(flags);
-                       cmd->scsi_done(cmd);
-                       return SUCCESS;
-               } else {
-                       local_irq_restore(flags);
-                       printk("scsi%d: abort of connected command failed!\n", HOSTNO);
-                       return FAILED;
-               }
+       if (hostdata->selecting == cmd) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: cmd %p == selecting\n", cmd);
+               hostdata->selecting = NULL;
+               cmd->result = DID_ABORT << 16;
+               complete_cmd(instance, cmd);
+               goto out;
        }
-#endif
 
-       /*
-        * Case 2 : If the command hasn't been issued yet, we simply remove it
-        *          from the issue queue.
-        */
-       for (prev = (struct scsi_cmnd **)&(hostdata->issue_queue),
-            tmp = (struct scsi_cmnd *)hostdata->issue_queue;
-            tmp; prev = NEXTADDR(tmp), tmp = NEXT(tmp)) {
-               if (cmd == tmp) {
-                       REMOVE(5, *prev, tmp, NEXT(tmp));
-                       (*prev) = NEXT(tmp);
-                       SET_NEXT(tmp, NULL);
-                       tmp->result = DID_ABORT << 16;
-                       maybe_release_dma_irq(instance);
-                       local_irq_restore(flags);
-                       dprintk(NDEBUG_ABORT, "scsi%d: abort removed command from issue queue.\n",
-                                   HOSTNO);
-                       /* Tagged queuing note: no tag to free here, hasn't been assigned
-                        * yet... */
-                       tmp->scsi_done(tmp);
-                       return SUCCESS;
+       if (list_del_cmd(&hostdata->disconnected, cmd)) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: removed %p from disconnected list\n", cmd);
+               cmd->result = DID_ERROR << 16;
+               if (!hostdata->connected)
+                       NCR5380_select(instance, cmd);
+               if (hostdata->connected != cmd) {
+                       complete_cmd(instance, cmd);
+                       result = FAILED;
+                       goto out;
                }
        }
 
-       /*
-        * Case 3 : If any commands are connected, we're going to fail the abort
-        *          and let the high level SCSI driver retry at a later time or
-        *          issue a reset.
-        *
-        *          Timeouts, and therefore aborted commands, will be highly unlikely
-        *          and handling them cleanly in this situation would make the common
-        *          case of noresets less efficient, and would pollute our code.  So,
-        *          we fail.
-        */
+       if (hostdata->connected == cmd) {
+               dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd);
+               hostdata->connected = NULL;
+               if (do_abort(instance)) {
+                       set_host_byte(cmd, DID_ERROR);
+                       complete_cmd(instance, cmd);
+                       result = FAILED;
+                       goto out;
+               }
+               set_host_byte(cmd, DID_ABORT);
+#ifdef REAL_DMA
+               hostdata->dma_len = 0;
+#endif
+               if (cmd->cmnd[0] == REQUEST_SENSE)
+                       complete_cmd(instance, cmd);
+               else {
+                       struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);
 
-       if (hostdata->connected) {
-               local_irq_restore(flags);
-               dprintk(NDEBUG_ABORT, "scsi%d: abort failed, command connected.\n", HOSTNO);
-               return FAILED;
+                       /* Perform autosense for this command */
+                       list_add(&ncmd->list, &hostdata->autosense);
+               }
        }
 
-       /*
-        * Case 4: If the command is currently disconnected from the bus, and
-        *      there are no connected commands, we reconnect the I_T_L or
-        *      I_T_L_Q nexus associated with it, go into message out, and send
-        *      an abort message.
-        *
-        * This case is especially ugly. In order to reestablish the nexus, we
-        * need to call NCR5380_select().  The easiest way to implement this
-        * function was to abort if the bus was busy, and let the interrupt
-        * handler triggered on the SEL for reselect take care of lost arbitrations
-        * where necessary, meaning interrupts need to be enabled.
-        *
-        * When interrupts are enabled, the queues may change - so we
-        * can't remove it from the disconnected queue before selecting it
-        * because that could cause a failure in hashing the nexus if that
-        * device reselected.
-        *
-        * Since the queues may change, we can't use the pointers from when we
-        * first locate it.
-        *
-        * So, we must first locate the command, and if NCR5380_select()
-        * succeeds, then issue the abort, relocate the command and remove
-        * it from the disconnected queue.
-        */
-
-       for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp;
-            tmp = NEXT(tmp)) {
-               if (cmd == tmp) {
-                       local_irq_restore(flags);
-                       dprintk(NDEBUG_ABORT, "scsi%d: aborting disconnected command.\n", HOSTNO);
-
-                       if (NCR5380_select(instance, cmd))
-                               return FAILED;
-
-                       dprintk(NDEBUG_ABORT, "scsi%d: nexus reestablished.\n", HOSTNO);
-
-                       do_abort(instance);
-
-                       local_irq_save(flags);
-                       for (prev = (struct scsi_cmnd **)&(hostdata->disconnected_queue),
-                            tmp = (struct scsi_cmnd *)hostdata->disconnected_queue;
-                            tmp; prev = NEXTADDR(tmp), tmp = NEXT(tmp)) {
-                               if (cmd == tmp) {
-                                       REMOVE(5, *prev, tmp, NEXT(tmp));
-                                       *prev = NEXT(tmp);
-                                       SET_NEXT(tmp, NULL);
-                                       tmp->result = DID_ABORT << 16;
-                                       /* We must unlock the tag/LUN immediately here, since the
-                                        * target goes to BUS FREE and doesn't send us another
-                                        * message (COMMAND_COMPLETE or the like)
-                                        */
-#ifdef SUPPORT_TAGS
-                                       cmd_free_tag(tmp);
-#else
-                                       hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun);
-#endif
-                                       maybe_release_dma_irq(instance);
-                                       local_irq_restore(flags);
-                                       tmp->scsi_done(tmp);
-                                       return SUCCESS;
-                               }
-                       }
+       if (list_find_cmd(&hostdata->autosense, cmd)) {
+               dsprintk(NDEBUG_ABORT, instance,
+                        "abort: found %p on sense queue\n", cmd);
+               spin_unlock_irqrestore(&hostdata->lock, flags);
+               queue_work(hostdata->work_q, &hostdata->main_task);
+               msleep(1000);
+               spin_lock_irqsave(&hostdata->lock, flags);
+               if (list_del_cmd(&hostdata->autosense, cmd)) {
+                       dsprintk(NDEBUG_ABORT, instance,
+                                "abort: removed %p from sense queue\n", cmd);
+                       set_host_byte(cmd, DID_ABORT);
+                       complete_cmd(instance, cmd);
+                       goto out;
                }
        }
 
-       /* Maybe it is sufficient just to release the ST-DMA lock... (if
-        * possible at all) At least, we should check if the lock could be
-        * released after the abort, in case it is kept due to some bug.
-        */
-       maybe_release_dma_irq(instance);
-       local_irq_restore(flags);
+       if (hostdata->connected == cmd) {
+               dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd);
+               hostdata->connected = NULL;
+               if (do_abort(instance)) {
+                       set_host_byte(cmd, DID_ERROR);
+                       complete_cmd(instance, cmd);
+                       result = FAILED;
+                       goto out;
+               }
+               set_host_byte(cmd, DID_ABORT);
+#ifdef REAL_DMA
+               hostdata->dma_len = 0;
+#endif
+               complete_cmd(instance, cmd);
+       }
 
-       /*
-        * Case 5 : If we reached this point, the command was not found in any of
-        *          the queues.
-        *
-        * We probably reached this point because of an unlikely race condition
-        * between the command completing successfully and the abortion code,
-        * so we won't panic, but we will notify the user in case something really
-        * broke.
-        */
+out:
+       if (result == FAILED)
+               dsprintk(NDEBUG_ABORT, instance, "abort: failed to abort %p\n", cmd);
+       else
+               dsprintk(NDEBUG_ABORT, instance, "abort: successfully aborted %p\n", cmd);
 
-       printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully before abortion\n", HOSTNO);
+       queue_work(hostdata->work_q, &hostdata->main_task);
+       maybe_release_dma_irq(instance);
+       spin_unlock_irqrestore(&hostdata->lock, flags);
 
-       return FAILED;
+       return result;
 }
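The abort path now prefers set_host_byte(cmd, DID_ABORT) over the open-coded `cmd->result = DID_ABORT << 16` it replaces: the helper rewrites only the host byte and preserves the status, message, and driver bytes already accumulated in cmd->result. Its definition in scsi_cmnd.h of this era is approximately:

/* From <scsi/scsi_cmnd.h> (approximate; check your tree) */
static inline void set_host_byte(struct scsi_cmnd *cmd, char status)
{
        cmd->result = (cmd->result & 0x00ffffff) | (status << 16);
}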
 
 
-/*
- * Function : int NCR5380_reset (struct scsi_cmnd *cmd)
- *
- * Purpose : reset the SCSI bus.
- *
- * Returns : SUCCESS or FAILURE
+/**
+ * NCR5380_bus_reset - reset the SCSI bus
+ * @cmd: SCSI command undergoing EH
  *
+ * Returns SUCCESS
  */
 
 static int NCR5380_bus_reset(struct scsi_cmnd *cmd)
@@ -2876,23 +2623,22 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd)
        struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int i;
        unsigned long flags;
+       struct NCR5380_cmd *ncmd;
 
-       NCR5380_print_status(instance);
+       spin_lock_irqsave(&hostdata->lock, flags);
+
+#if (NDEBUG & NDEBUG_ANY)
+       scmd_printk(KERN_INFO, cmd, __func__);
+#endif
+       NCR5380_dprint(NDEBUG_ANY, instance);
+       NCR5380_dprint_phase(NDEBUG_ANY, instance);
+
+       do_reset(instance);
 
-       /* get in phase */
-       NCR5380_write(TARGET_COMMAND_REG,
-                     PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG)));
-       /* assert RST */
-       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST);
-       udelay(40);
        /* reset NCR registers */
-       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
        NCR5380_write(MODE_REG, MR_BASE);
        NCR5380_write(TARGET_COMMAND_REG, 0);
        NCR5380_write(SELECT_ENABLE_REG, 0);
-       /* ++roman: reset interrupt condition! otherwise no interrupts don't get
-        * through anymore ... */
-       (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG);
 
        /* After the reset, there are no more connected or disconnected commands
         * and no busy units; so clear the low-level status here to avoid
@@ -2900,17 +2646,34 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd)
         * commands!
         */
 
-       if (hostdata->issue_queue)
-               dprintk(NDEBUG_ABORT, "scsi%d: reset aborted issued command(s)\n", H_NO(cmd));
-       if (hostdata->connected)
-               dprintk(NDEBUG_ABORT, "scsi%d: reset aborted a connected command\n", H_NO(cmd));
-       if (hostdata->disconnected_queue)
-               dprintk(NDEBUG_ABORT, "scsi%d: reset aborted disconnected command(s)\n", H_NO(cmd));
+       hostdata->selecting = NULL;
+
+       list_for_each_entry(ncmd, &hostdata->disconnected, list) {
+               struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);
+
+               set_host_byte(cmd, DID_RESET);
+               cmd->scsi_done(cmd);
+       }
+
+       list_for_each_entry(ncmd, &hostdata->autosense, list) {
+               struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);
+
+               set_host_byte(cmd, DID_RESET);
+               cmd->scsi_done(cmd);
+       }
+
+       if (hostdata->connected) {
+               set_host_byte(hostdata->connected, DID_RESET);
+               complete_cmd(instance, hostdata->connected);
+               hostdata->connected = NULL;
+       }
+
+       if (hostdata->sensing) {
+               set_host_byte(hostdata->sensing, DID_RESET);
+               complete_cmd(instance, hostdata->sensing);
+               hostdata->sensing = NULL;
+       }
 
-       local_irq_save(flags);
-       hostdata->issue_queue = NULL;
-       hostdata->connected = NULL;
-       hostdata->disconnected_queue = NULL;
 #ifdef SUPPORT_TAGS
        free_all_tags(hostdata);
 #endif
@@ -2920,8 +2683,9 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd)
        hostdata->dma_len = 0;
 #endif
 
+       queue_work(hostdata->work_q, &hostdata->main_task);
        maybe_release_dma_irq(instance);
-       local_irq_restore(flags);
+       spin_unlock_irqrestore(&hostdata->lock, flags);
 
        return SUCCESS;
 }
index 5ede3daa93dc546c63a67809882f61009fb29582..78d1b2963f2c04b00ef0afbfb33be0560eef0361 100644 (file)
@@ -66,7 +66,6 @@
 
 #include <linux/module.h>
 #include <linux/types.h>
-#include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
@@ -98,7 +97,6 @@
 
 #define NCR5380_queue_command           atari_scsi_queue_command
 #define NCR5380_abort                   atari_scsi_abort
-#define NCR5380_show_info               atari_scsi_show_info
 #define NCR5380_info                    atari_scsi_info
 
 #define NCR5380_dma_read_setup(instance, data, count) \
@@ -161,23 +159,10 @@ static inline unsigned long SCSI_DMA_GETADR(void)
        return adr;
 }
 
-#define HOSTDATA_DMALEN                (((struct NCR5380_hostdata *) \
-                               (atari_scsi_host->hostdata))->dma_len)
-
-/* Time (in jiffies) to wait after a reset; the SCSI standard calls for 250ms,
- * we usually do 0.5s to be on the safe side. But Toshiba CD-ROMs once more
- * need ten times the standard value... */
-#ifndef CONFIG_ATARI_SCSI_TOSHIBA_DELAY
-#define        AFTER_RESET_DELAY       (HZ/2)
-#else
-#define        AFTER_RESET_DELAY       (5*HZ/2)
-#endif
-
 #ifdef REAL_DMA
 static void atari_scsi_fetch_restbytes(void);
 #endif
 
-static struct Scsi_Host *atari_scsi_host;
 static unsigned char (*atari_scsi_reg_read)(unsigned char reg);
 static void (*atari_scsi_reg_write)(unsigned char reg, unsigned char value);
 
@@ -208,12 +193,12 @@ static int setup_cmd_per_lun = -1;
 module_param(setup_cmd_per_lun, int, 0);
 static int setup_sg_tablesize = -1;
 module_param(setup_sg_tablesize, int, 0);
-#ifdef SUPPORT_TAGS
 static int setup_use_tagged_queuing = -1;
 module_param(setup_use_tagged_queuing, int, 0);
-#endif
 static int setup_hostid = -1;
 module_param(setup_hostid, int, 0);
+static int setup_toshiba_delay = -1;
+module_param(setup_toshiba_delay, int, 0);
 
 
 #if defined(REAL_DMA)
@@ -273,15 +258,17 @@ static void scsi_dma_buserr(int irq, void *dummy)
 #endif
 
 
-static irqreturn_t scsi_tt_intr(int irq, void *dummy)
+static irqreturn_t scsi_tt_intr(int irq, void *dev)
 {
 #ifdef REAL_DMA
+       struct Scsi_Host *instance = dev;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int dma_stat;
 
        dma_stat = tt_scsi_dma.dma_ctrl;
 
-       dprintk(NDEBUG_INTR, "scsi%d: NCR5380 interrupt, DMA status = %02x\n",
-                  atari_scsi_host->host_no, dma_stat & 0xff);
+       dsprintk(NDEBUG_INTR, instance, "NCR5380 interrupt, DMA status = %02x\n",
+                dma_stat & 0xff);
 
        /* Look if it was the DMA that has interrupted: First possibility
         * is that a bus error occurred...
@@ -304,7 +291,8 @@ static irqreturn_t scsi_tt_intr(int irq, void *dummy)
         * data reg!
         */
        if ((dma_stat & 0x02) && !(dma_stat & 0x40)) {
-               atari_dma_residual = HOSTDATA_DMALEN - (SCSI_DMA_READ_P(dma_addr) - atari_dma_startaddr);
+               atari_dma_residual = hostdata->dma_len -
+                       (SCSI_DMA_READ_P(dma_addr) - atari_dma_startaddr);
 
                dprintk(NDEBUG_DMA, "SCSI DMA: There are %ld residual bytes.\n",
                           atari_dma_residual);
@@ -356,15 +344,17 @@ static irqreturn_t scsi_tt_intr(int irq, void *dummy)
 
 #endif /* REAL_DMA */
 
-       NCR5380_intr(irq, dummy);
+       NCR5380_intr(irq, dev);
 
        return IRQ_HANDLED;
 }
 
 
-static irqreturn_t scsi_falcon_intr(int irq, void *dummy)
+static irqreturn_t scsi_falcon_intr(int irq, void *dev)
 {
 #ifdef REAL_DMA
+       struct Scsi_Host *instance = dev;
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int dma_stat;
 
        /* Turn off DMA and select sector counter register before
@@ -399,7 +389,7 @@ static irqreturn_t scsi_falcon_intr(int irq, void *dummy)
                        printk(KERN_ERR "SCSI DMA error: %ld bytes lost in "
                               "ST-DMA fifo\n", transferred & 15);
 
-               atari_dma_residual = HOSTDATA_DMALEN - transferred;
+               atari_dma_residual = hostdata->dma_len - transferred;
                dprintk(NDEBUG_DMA, "SCSI DMA: There are %ld residual bytes.\n",
                           atari_dma_residual);
        } else
@@ -411,13 +401,14 @@ static irqreturn_t scsi_falcon_intr(int irq, void *dummy)
                 * data to the original destination address.
                 */
                memcpy(atari_dma_orig_addr, phys_to_virt(atari_dma_startaddr),
-                      HOSTDATA_DMALEN - atari_dma_residual);
+                      hostdata->dma_len - atari_dma_residual);
                atari_dma_orig_addr = NULL;
        }
 
 #endif /* REAL_DMA */
 
-       NCR5380_intr(irq, dummy);
+       NCR5380_intr(irq, dev);
+
        return IRQ_HANDLED;
 }
 
@@ -488,7 +479,7 @@ static int __init atari_scsi_setup(char *str)
         * Defaults depend on TT or Falcon, determined at run time.
         * Negative values mean don't change.
         */
-       int ints[6];
+       int ints[8];
 
        get_options(str, ARRAY_SIZE(ints), ints);
 
@@ -504,10 +495,11 @@ static int __init atari_scsi_setup(char *str)
                setup_sg_tablesize = ints[3];
        if (ints[0] >= 4)
                setup_hostid = ints[4];
-#ifdef SUPPORT_TAGS
        if (ints[0] >= 5)
                setup_use_tagged_queuing = ints[5];
-#endif
+       /* ints[6] (use_pdma) is ignored */
+       if (ints[0] >= 7)
+               setup_toshiba_delay = ints[7];
 
        return 1;
 }
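get_options() reports the number of integers it parsed in ints[0] and stores the values from ints[1] onward, which is why every option above is guarded with `if (ints[0] >= n)` and why the array grows from six to eight slots when the two new positions (use_pdma, ignored, and toshiba_delay) are appended. The idiom, as a sketch with a hypothetical parameter name:

#include <linux/init.h>
#include <linux/kernel.h>       /* get_options(), ARRAY_SIZE() */

static int __init example_setup(char *str)
{
        int ints[8];    /* ints[0] holds the count; up to 7 values */

        get_options(str, ARRAY_SIZE(ints), ints);
        if (ints[0] >= 1)
                pr_info("first option: %d\n", ints[1]);
        return 1;
}
__setup("example=", example_setup);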
@@ -516,38 +508,6 @@ __setup("atascsi=", atari_scsi_setup);
 #endif /* !MODULE */
 
 
-#ifdef CONFIG_ATARI_SCSI_RESET_BOOT
-static void __init atari_scsi_reset_boot(void)
-{
-       unsigned long end;
-
-       /*
-        * Do a SCSI reset to clean up the bus during initialization. No messing
-        * with the queues, interrupts, or locks necessary here.
-        */
-
-       printk("Atari SCSI: resetting the SCSI bus...");
-
-       /* get in phase */
-       NCR5380_write(TARGET_COMMAND_REG,
-                     PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG)));
-
-       /* assert RST */
-       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST);
-       /* The min. reset hold time is 25us, so 40us should be enough */
-       udelay(50);
-       /* reset RST and interrupt */
-       NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE);
-       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-
-       end = jiffies + AFTER_RESET_DELAY;
-       while (time_before(jiffies, end))
-               barrier();
-
-       printk(" done\n");
-}
-#endif
-
 #if defined(REAL_DMA)
 
 static unsigned long atari_scsi_dma_setup(struct Scsi_Host *instance,
@@ -815,14 +775,14 @@ static int atari_scsi_bus_reset(struct scsi_cmnd *cmd)
 static struct scsi_host_template atari_scsi_template = {
        .module                 = THIS_MODULE,
        .proc_name              = DRV_MODULE_NAME,
-       .show_info              = atari_scsi_show_info,
        .name                   = "Atari native SCSI",
        .info                   = atari_scsi_info,
        .queuecommand           = atari_scsi_queue_command,
        .eh_abort_handler       = atari_scsi_abort,
        .eh_bus_reset_handler   = atari_scsi_bus_reset,
        .this_id                = 7,
-       .use_clustering         = DISABLE_CLUSTERING
+       .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
 };
 
 static int __init atari_scsi_probe(struct platform_device *pdev)
@@ -880,7 +840,7 @@ static int __init atari_scsi_probe(struct platform_device *pdev)
        } else {
                /* Test if a host id is set in the NVRam */
                if (ATARIHW_PRESENT(TT_CLK) && nvram_check_checksum()) {
-                       unsigned char b = nvram_read_byte(14);
+                       unsigned char b = nvram_read_byte(16);
 
                        /* Arbitration enabled? (for TOS)
                         * If yes, use configured host ID
@@ -915,21 +875,18 @@ static int __init atari_scsi_probe(struct platform_device *pdev)
                error = -ENOMEM;
                goto fail_alloc;
        }
-       atari_scsi_host = instance;
-
-#ifdef CONFIG_ATARI_SCSI_RESET_BOOT
-       atari_scsi_reset_boot();
-#endif
 
        instance->irq = irq->start;
 
        host_flags |= IS_A_TT() ? 0 : FLAG_LATE_DMA_SETUP;
-
 #ifdef SUPPORT_TAGS
        host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0;
 #endif
+       host_flags |= setup_toshiba_delay > 0 ? FLAG_TOSHIBA_DELAY : 0;
 
-       NCR5380_init(instance, host_flags);
+       error = NCR5380_init(instance, host_flags);
+       if (error)
+               goto fail_init;
 
        if (IS_A_TT()) {
                error = request_irq(instance->irq, scsi_tt_intr, 0,
@@ -975,6 +932,8 @@ static int __init atari_scsi_probe(struct platform_device *pdev)
 #endif
        }
 
+       NCR5380_maybe_reset_bus(instance);
+
        error = scsi_add_host(instance, NULL);
        if (error)
                goto fail_host;
@@ -989,6 +948,7 @@ fail_host:
                free_irq(instance->irq, instance);
 fail_irq:
        NCR5380_exit(instance);
+fail_init:
        scsi_host_put(instance);
 fail_alloc:
        if (atari_dma_buffer)
index 4e7cad27246944f6e79cf21a56fb3f25f96e1234..bad5f32e1f670587ddedda3ec97104e1c2d4555a 100644 (file)
@@ -3,6 +3,7 @@ config BE2ISCSI
        depends on PCI && SCSI && NET
        select SCSI_ISCSI_ATTRS
        select ISCSI_BOOT_SYSFS
+       select IRQ_POLL
 
        help
        This driver implements the iSCSI functionality for Emulex
index 77f992e7472629eeeaedf3470ced43d02077d9ca..a41c6432f4446cf9082916a0859e08be3c0a70c7 100644 (file)
@@ -20,7 +20,7 @@
 
 #include <linux/pci.h>
 #include <linux/if_vlan.h>
-#include <linux/blk-iopoll.h>
+#include <linux/irq_poll.h>
 #define FW_VER_LEN     32
 #define MCC_Q_LEN      128
 #define MCC_CQ_LEN     256
@@ -101,7 +101,7 @@ struct be_eq_obj {
        struct beiscsi_hba *phba;
        struct be_queue_info *cq;
        struct work_struct work_cqs; /* Work Item */
-       struct blk_iopoll       iopoll;
+       struct irq_poll iopoll;
 };
 
 struct be_mcc_obj {
index b7087ba69d8df1c2daa9ac7b1b8a9ab314257a8b..022e87b62e401a37e1d6578e065f389ce560a8b4 100644 (file)
@@ -1292,9 +1292,9 @@ static void beiscsi_flush_cq(struct beiscsi_hba *phba)
 
        for (i = 0; i < phba->num_cpus; i++) {
                pbe_eq = &phwi_context->be_eq[i];
-               blk_iopoll_disable(&pbe_eq->iopoll);
+               irq_poll_disable(&pbe_eq->iopoll);
                beiscsi_process_cq(pbe_eq);
-               blk_iopoll_enable(&pbe_eq->iopoll);
+               irq_poll_enable(&pbe_eq->iopoll);
        }
 }
 
index fe0c5143f8e68a6cb3e9618c98258c9e4118e1ae..cb9072a841be19cbfde9ff655fec50c523edf8b3 100644 (file)
@@ -910,8 +910,7 @@ static irqreturn_t be_isr_msix(int irq, void *dev_id)
        num_eq_processed = 0;
        while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32]
                                & EQE_VALID_MASK) {
-               if (!blk_iopoll_sched_prep(&pbe_eq->iopoll))
-                       blk_iopoll_sched(&pbe_eq->iopoll);
+               irq_poll_sched(&pbe_eq->iopoll);
 
                AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0);
                queue_tail_inc(eq);
@@ -972,8 +971,7 @@ static irqreturn_t be_isr(int irq, void *dev_id)
                        spin_unlock_irqrestore(&phba->isr_lock, flags);
                        num_mcceq_processed++;
                } else {
-                       if (!blk_iopoll_sched_prep(&pbe_eq->iopoll))
-                               blk_iopoll_sched(&pbe_eq->iopoll);
+                       irq_poll_sched(&pbe_eq->iopoll);
                        num_ioeq_processed++;
                }
                AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0);
@@ -2295,7 +2293,7 @@ void beiscsi_process_all_cqs(struct work_struct *work)
        hwi_ring_eq_db(phba, pbe_eq->q.id, 0, 0, 1, 1);
 }
 
-static int be_iopoll(struct blk_iopoll *iop, int budget)
+static int be_iopoll(struct irq_poll *iop, int budget)
 {
        unsigned int ret;
        struct beiscsi_hba *phba;
@@ -2306,7 +2304,7 @@ static int be_iopoll(struct blk_iopoll *iop, int budget)
        pbe_eq->cq_count += ret;
        if (ret < budget) {
                phba = pbe_eq->phba;
-               blk_iopoll_complete(iop);
+               irq_poll_complete(iop);
                beiscsi_log(phba, KERN_INFO,
                            BEISCSI_LOG_CONFIG | BEISCSI_LOG_IO,
                            "BM_%d : rearm pbe_eq->q.id =%d\n",
@@ -5293,7 +5291,7 @@ static void beiscsi_quiesce(struct beiscsi_hba *phba,
 
        for (i = 0; i < phba->num_cpus; i++) {
                pbe_eq = &phwi_context->be_eq[i];
-               blk_iopoll_disable(&pbe_eq->iopoll);
+               irq_poll_disable(&pbe_eq->iopoll);
        }
 
        if (unload_state == BEISCSI_CLEAN_UNLOAD) {
@@ -5579,9 +5577,8 @@ static void beiscsi_eeh_resume(struct pci_dev *pdev)
 
        for (i = 0; i < phba->num_cpus; i++) {
                pbe_eq = &phwi_context->be_eq[i];
-               blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget,
+               irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget,
                                be_iopoll);
-               blk_iopoll_enable(&pbe_eq->iopoll);
        }
 
        i = (phba->msix_enabled) ? i : 0;
@@ -5752,9 +5749,8 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev,
 
        for (i = 0; i < phba->num_cpus; i++) {
                pbe_eq = &phwi_context->be_eq[i];
-               blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget,
+               irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget,
                                be_iopoll);
-               blk_iopoll_enable(&pbe_eq->iopoll);
        }
 
        i = (phba->msix_enabled) ? i : 0;
@@ -5795,7 +5791,7 @@ free_blkenbld:
        destroy_workqueue(phba->wq);
        for (i = 0; i < phba->num_cpus; i++) {
                pbe_eq = &phwi_context->be_eq[i];
-               blk_iopoll_disable(&pbe_eq->iopoll);
+               irq_poll_disable(&pbe_eq->iopoll);
        }
 free_twq:
        beiscsi_clean_port(phba);
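The be2iscsi hunks follow the 4.5 rename of blk_iopoll to irq_poll. Two behavioural points are visible in the diff: irq_poll_sched() now performs the sched-prep test itself, so the old blk_iopoll_sched_prep()/blk_iopoll_sched() pair collapses into a single call, and a freshly initialized irq_poll instance is already enabled, so the irq_poll_enable() that used to follow blk_iopoll_init() is dropped. A minimal consumer of the new API, as a sketch (event processing elided; the my_* names are hypothetical):

#include <linux/interrupt.h>
#include <linux/irq_poll.h>

static struct irq_poll my_iop;

/* Poll callback: consume up to 'budget' events; complete when drained. */
static int my_poll(struct irq_poll *iop, int budget)
{
        int done = 0;   /* e.g. done = process_completions(budget); */

        if (done < budget)
                irq_poll_complete(iop); /* drained: let the IRQ rearm */
        return done;
}

static irqreturn_t my_isr(int irq, void *dev_id)
{
        irq_poll_sched(&my_iop);        /* no *_sched_prep() step needed */
        return IRQ_HANDLED;
}

static void my_setup(void)
{
        irq_poll_init(&my_iop, 64, my_poll);    /* starts enabled */
}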
index 0e2bee937fe81b345abee5ca20beec65f098f175..e22a268fd311b9a859166c57a32b3cf499fd6c26 100644 (file)
@@ -57,7 +57,7 @@ MODULE_PARM_DESC(cxgb3i_snd_win, "TCP send window in bytes (default=128KB)");
 
 static int cxgb3i_rx_credit_thres = 10 * 1024;
 module_param(cxgb3i_rx_credit_thres, int, 0644);
-MODULE_PARM_DESC(rx_credit_thres,
+MODULE_PARM_DESC(cxgb3i_rx_credit_thres,
                 "RX credits return threshold in bytes (default=10KB)");
 
 static unsigned int cxgb3i_max_connect = 8 * 1024;
index 3e088125a8be6a339757dc1380f1c73418952bfc..6c14e68b9e1a80aab149ed3f7556ce3e22de7280 100644 (file)
 
 #define DONT_USE_INTR
 
-#define NCR5380_read(reg)              inb(port + reg)
-#define NCR5380_write(reg, value)      outb(value, port + reg)
+#define NCR5380_read(reg)              inb(instance->io_port + reg)
+#define NCR5380_write(reg, value)      outb(value, instance->io_port + reg)
 
 #define NCR5380_implementation_fields  /* none */
-#define NCR5380_local_declare()                unsigned int port
-#define NCR5380_setup(instance)                port = instance->io_port
-
-/*
- * Includes needed for NCR5380.[ch] (XXX: Move them to NCR5380.h)
- */
-#include <linux/delay.h>
 
 #include "NCR5380.h"
 #include "NCR5380.c"
@@ -56,6 +49,7 @@
 
 
 static struct scsi_host_template dmx3191d_driver_template = {
+       .module                 = THIS_MODULE,
        .proc_name              = DMX3191D_DRIVER_NAME,
        .name                   = "Domex DMX3191D",
        .info                   = NCR5380_info,
@@ -67,6 +61,8 @@ static struct scsi_host_template dmx3191d_driver_template = {
        .sg_tablesize           = SG_ALL,
        .cmd_per_lun            = 2,
        .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
 
 static int dmx3191d_probe_one(struct pci_dev *pdev,
@@ -97,17 +93,25 @@ static int dmx3191d_probe_one(struct pci_dev *pdev,
         */
        shost->irq = NO_IRQ;
 
-       NCR5380_init(shost, FLAG_NO_PSEUDO_DMA | FLAG_DTC3181E);
+       error = NCR5380_init(shost, FLAG_NO_PSEUDO_DMA);
+       if (error)
+               goto out_host_put;
+
+       NCR5380_maybe_reset_bus(shost);
 
        pci_set_drvdata(pdev, shost);
 
        error = scsi_add_host(shost, &pdev->dev);
        if (error)
-               goto out_release_region;
+               goto out_exit;
 
        scsi_scan_host(shost);
        return 0;
 
+out_exit:
+       NCR5380_exit(shost);
+out_host_put:
+       scsi_host_put(shost);
  out_release_region:
        release_region(io, DMX3191D_REGION_LEN);
  out_disable_device:
@@ -119,15 +123,14 @@ static int dmx3191d_probe_one(struct pci_dev *pdev,
 static void dmx3191d_remove_one(struct pci_dev *pdev)
 {
        struct Scsi_Host *shost = pci_get_drvdata(pdev);
+       unsigned long io = shost->io_port;
 
        scsi_remove_host(shost);
 
        NCR5380_exit(shost);
-
-       release_region(shost->io_port, DMX3191D_REGION_LEN);
-       pci_disable_device(pdev);
-
        scsi_host_put(shost);
+       release_region(io, DMX3191D_REGION_LEN);
+       pci_disable_device(pdev);
 }
 
 static struct pci_device_id dmx3191d_pci_tbl[] = {
index 4c74c7ba2dff7cf6829684176c484ecbf356cea1..6c736b071cf4bcbaf8f5fb68b19192dc039dd293 100644 (file)
@@ -1,9 +1,5 @@
-
 #define PSEUDO_DMA
 #define DONT_USE_INTR
-#define UNSAFE                 /* Leave interrupts enabled during pseudo-dma I/O */
-#define DMA_WORKS_RIGHT
-
 
 /*
  * DTC 3180/3280 driver, by
 
 
 #include <linux/module.h>
-#include <linux/signal.h>
 #include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <scsi/scsi_host.h>
+
 #include "dtc.h"
 #define AUTOPROBE_IRQ
 #include "NCR5380.h"
@@ -150,7 +144,7 @@ static const struct signature {
 
 static int __init dtc_setup(char *str)
 {
-       static int commandline_current = 0;
+       static int commandline_current;
        int i;
        int ints[10];
 
@@ -188,7 +182,7 @@ __setup("dtc=", dtc_setup);
 
 static int __init dtc_detect(struct scsi_host_template * tpnt)
 {
-       static int current_override = 0, current_base = 0;
+       static int current_override, current_base;
        struct Scsi_Host *instance;
        unsigned int addr;
        void __iomem *base;
@@ -205,9 +199,8 @@ static int __init dtc_detect(struct scsi_host_template * tpnt)
                                addr = 0;
                } else
                        for (; !addr && (current_base < NO_BASES); ++current_base) {
-#if (DTCDEBUG & DTCDEBUG_INIT)
-                               printk(KERN_DEBUG "scsi-dtc : probing address %08x\n", bases[current_base].address);
-#endif
+                               dprintk(NDEBUG_INIT, "dtc: probing address 0x%08x\n",
+                                       (unsigned int)bases[current_base].address);
                                if (bases[current_base].noauto)
                                        continue;
                                base = ioremap(bases[current_base].address, 0x2000);
@@ -216,18 +209,14 @@ static int __init dtc_detect(struct scsi_host_template * tpnt)
                                for (sig = 0; sig < NO_SIGNATURES; ++sig) {
                                        if (check_signature(base + signatures[sig].offset, signatures[sig].string, strlen(signatures[sig].string))) {
                                                addr = bases[current_base].address;
-#if (DTCDEBUG & DTCDEBUG_INIT)
-                                               printk(KERN_DEBUG "scsi-dtc : detected board.\n");
-#endif
+                                               dprintk(NDEBUG_INIT, "dtc: detected board\n");
                                                goto found;
                                        }
                                }
                                iounmap(base);
                        }
 
-#if defined(DTCDEBUG) && (DTCDEBUG & DTCDEBUG_INIT)
-               printk(KERN_DEBUG "scsi-dtc : base = %08x\n", addr);
-#endif
+               dprintk(NDEBUG_INIT, "dtc: addr = 0x%08x\n", addr);
 
                if (!addr)
                        break;
@@ -235,12 +224,15 @@ static int __init dtc_detect(struct scsi_host_template * tpnt)
 found:
                instance = scsi_register(tpnt, sizeof(struct NCR5380_hostdata));
                if (instance == NULL)
-                       break;
+                       goto out_unmap;
 
                instance->base = addr;
                ((struct NCR5380_hostdata *)(instance)->hostdata)->base = base;
 
-               NCR5380_init(instance, 0);
+               if (NCR5380_init(instance, FLAG_NO_DMA_FIXUP))
+                       goto out_unregister;
+
+               NCR5380_maybe_reset_bus(instance);
 
                NCR5380_write(DTC_CONTROL_REG, CSR_5380_INTR);  /* Enable int's */
                if (overrides[current_override].irq != IRQ_AUTO)
@@ -271,14 +263,19 @@ found:
                        printk(KERN_WARNING "scsi%d : interrupts not used. Might as well not jumper it.\n", instance->host_no);
                instance->irq = NO_IRQ;
 #endif
-#if defined(DTCDEBUG) && (DTCDEBUG & DTCDEBUG_INIT)
-               printk("scsi%d : irq = %d\n", instance->host_no, instance->irq);
-#endif
+               dprintk(NDEBUG_INIT, "scsi%d : irq = %d\n",
+                       instance->host_no, instance->irq);
 
                ++current_override;
                ++count;
        }
        return count;
+
+out_unregister:
+       scsi_unregister(instance);
+out_unmap:
+       iounmap(base);
+       return count;
 }
 
 /*
@@ -331,12 +328,8 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst,
        unsigned char *d = dst;
        int i;                  /* For counting time spent in the poll-loop */
        struct NCR5380_hostdata *hostdata = shost_priv(instance);
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
 
        i = 0;
-       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-       NCR5380_write(MODE_REG, MR_ENABLE_EOP_INTR | MR_DMA_MODE);
        if (instance->irq == NO_IRQ)
                NCR5380_write(DTC_CONTROL_REG, CSR_DIR_READ);
        else
@@ -348,7 +341,7 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst,
                while (NCR5380_read(DTC_CONTROL_REG) & CSR_HOST_BUF_NOT_RDY)
                        ++i;
                rtrc(3);
-               memcpy_fromio(d, base + DTC_DATA_BUF, 128);
+               memcpy_fromio(d, hostdata->base + DTC_DATA_BUF, 128);
                d += 128;
                len -= 128;
                rtrc(7);
@@ -358,9 +351,7 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst,
        rtrc(4);
        while (!(NCR5380_read(DTC_CONTROL_REG) & D_CR_ACCESS))
                ++i;
-       NCR5380_write(MODE_REG, 0);     /* Clear the operating mode */
        rtrc(0);
-       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
        if (i > hostdata->spin_max_r)
                hostdata->spin_max_r = i;
        return (0);
@@ -383,12 +374,7 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src,
 {
        int i;
        struct NCR5380_hostdata *hostdata = shost_priv(instance);
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
 
-       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
-       NCR5380_write(MODE_REG, MR_ENABLE_EOP_INTR | MR_DMA_MODE);
-       /* set direction (write) */
        if (instance->irq == NO_IRQ)
                NCR5380_write(DTC_CONTROL_REG, 0);
        else
@@ -400,7 +386,7 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src,
                while (NCR5380_read(DTC_CONTROL_REG) & CSR_HOST_BUF_NOT_RDY)
                        ++i;
                rtrc(3);
-               memcpy_toio(base + DTC_DATA_BUF, src, 128);
+               memcpy_toio(hostdata->base + DTC_DATA_BUF, src, 128);
                src += 128;
                len -= 128;
        }
@@ -413,47 +399,60 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src,
                ++i;
        rtrc(7);
        /* Check for parity error here. fixme. */
-       NCR5380_write(MODE_REG, 0);     /* Clear the operating mode */
        rtrc(0);
        if (i > hostdata->spin_max_w)
                hostdata->spin_max_w = i;
        return (0);
 }
 
+static int dtc_dma_xfer_len(struct scsi_cmnd *cmd)
+{
+       int transfersize = cmd->transfersize;
+
+       /* Limit transfers to 32K, for xx400 & xx406
+        * pseudo-DMA that transfers in 128-byte blocks.
+        */
+       if (transfersize > 32 * 1024 && cmd->SCp.this_residual &&
+           !(cmd->SCp.this_residual % transfersize))
+               transfersize = 32 * 1024;
+
+       return transfersize;
+}
+
 MODULE_LICENSE("GPL");
 
 #include "NCR5380.c"
 
 static int dtc_release(struct Scsi_Host *shost)
 {
-       NCR5380_local_declare();
-       NCR5380_setup(shost);
+       struct NCR5380_hostdata *hostdata = shost_priv(shost);
+
        if (shost->irq != NO_IRQ)
                free_irq(shost->irq, shost);
        NCR5380_exit(shost);
-       if (shost->io_port && shost->n_io_port)
-               release_region(shost->io_port, shost->n_io_port);
        scsi_unregister(shost);
-       iounmap(base);
+       iounmap(hostdata->base);
        return 0;
 }
 
 static struct scsi_host_template driver_template = {
-       .name                           = "DTC 3180/3280 ",
-       .detect                         = dtc_detect,
-       .release                        = dtc_release,
-       .proc_name                      = "dtc3x80",
-       .show_info                      = dtc_show_info,
-       .write_info                     = dtc_write_info,
-       .info                           = dtc_info,
-       .queuecommand                   = dtc_queue_command,
-       .eh_abort_handler               = dtc_abort,
-       .eh_bus_reset_handler           = dtc_bus_reset,
-       .bios_param                     = dtc_biosparam,
-       .can_queue                      = CAN_QUEUE,
-       .this_id                        = 7,
-       .sg_tablesize                   = SG_ALL,
-       .cmd_per_lun                    = CMD_PER_LUN,
-       .use_clustering                 = DISABLE_CLUSTERING,
+       .name                   = "DTC 3180/3280",
+       .detect                 = dtc_detect,
+       .release                = dtc_release,
+       .proc_name              = "dtc3x80",
+       .show_info              = dtc_show_info,
+       .write_info             = dtc_write_info,
+       .info                   = dtc_info,
+       .queuecommand           = dtc_queue_command,
+       .eh_abort_handler       = dtc_abort,
+       .eh_bus_reset_handler   = dtc_bus_reset,
+       .bios_param             = dtc_biosparam,
+       .can_queue              = 32,
+       .this_id                = 7,
+       .sg_tablesize           = SG_ALL,
+       .cmd_per_lun            = 2,
+       .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
 #include "scsi_module.c"
index 78a2332e9064ee6b58372943ec0c327d2a74b26f..56732cba8abad0110f04532239d62fa1b8163c91 100644 (file)
 #ifndef DTC3280_H
 #define DTC3280_H
 
-#define DTCDEBUG 0
-#define DTCDEBUG_INIT  0x1
-#define DTCDEBUG_TRANSFER 0x2
-
-#ifndef CMD_PER_LUN
-#define CMD_PER_LUN 2
-#endif
-
-#ifndef CAN_QUEUE
-#define CAN_QUEUE 32 
-#endif
-
 #define NCR5380_implementation_fields \
     void __iomem *base
 
-#define NCR5380_local_declare() \
-    void __iomem *base
-
-#define NCR5380_setup(instance) \
-    base = ((struct NCR5380_hostdata *)(instance)->hostdata)->base
+#define DTC_address(reg) \
+       (((struct NCR5380_hostdata *)shost_priv(instance))->base + DTC_5380_OFFSET + reg)
 
-#define DTC_address(reg) (base + DTC_5380_OFFSET + reg)
-
-#define dbNCR5380_read(reg)                                              \
-    (rval=readb(DTC_address(reg)), \
-     (((unsigned char) printk("DTC : read register %d at addr %p is: %02x\n"\
-    , (reg), DTC_address(reg), rval)), rval ) )
-
-#define dbNCR5380_write(reg, value) do {                                  \
-    printk("DTC : write %02x to register %d at address %p\n",         \
-            (value), (reg), DTC_address(reg));     \
-    writeb(value, DTC_address(reg));} while(0)
-
-
-#if !(DTCDEBUG & DTCDEBUG_TRANSFER) 
 #define NCR5380_read(reg) (readb(DTC_address(reg)))
 #define NCR5380_write(reg, value) (writeb(value, DTC_address(reg)))
-#else
-#define NCR5380_read(reg) (readb(DTC_address(reg)))
-#define xNCR5380_read(reg)                                             \
-    (((unsigned char) printk("DTC : read register %d at address %p\n"\
-    , (reg), DTC_address(reg))), readb(DTC_address(reg)))
 
-#define NCR5380_write(reg, value) do {                                 \
-    printk("DTC : write %02x to register %d at address %p\n",  \
-           (value), (reg), DTC_address(reg));  \
-    writeb(value, DTC_address(reg));} while(0)
-#endif
+#define NCR5380_dma_xfer_len(instance, cmd, phase) \
+        dtc_dma_xfer_len(cmd)
 
 #define NCR5380_intr                   dtc_intr
 #define NCR5380_queue_command          dtc_queue_command
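With the cached-local macros gone, register access resolves through
shost_priv() at each call site; expanding NCR5380_read(STATUS_REG) by hand,
in a function with instance in scope, gives roughly:

        /* hand expansion of the macros above, for illustration only */
        readb(((struct NCR5380_hostdata *)shost_priv(instance))->base +
              DTC_5380_OFFSET + STATUS_REG);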
index f8d2478b11cc7a53453899db7b256543c7f21a33..90091e69302025cbbaaf1d5836ebfe1952b74e6e 100644 (file)
  *     
  */
 
-/* settings for DTC3181E card with only Mustek scanner attached */
-#define USLEEP_POLL    msecs_to_jiffies(10)
-#define USLEEP_SLEEP   msecs_to_jiffies(200)
-#define USLEEP_WAITLONG        msecs_to_jiffies(5000)
-
 #define AUTOPROBE_IRQ
 
 #ifdef CONFIG_SCSI_GENERIC_NCR53C400
-#define NCR53C400_PSEUDO_DMA 1
 #define PSEUDO_DMA
-#define NCR53C400
 #endif
 
 #include <asm/io.h>
-#include <linux/signal.h>
 #include <linux/blkdev.h>
+#include <linux/module.h>
 #include <scsi/scsi_host.h>
 #include "g_NCR5380.h"
 #include "NCR5380.h"
-#include <linux/stat.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/isapnp.h>
-#include <linux/delay.h>
 #include <linux/interrupt.h>
 
-#define NCR_NOT_SET 0
-static int ncr_irq = NCR_NOT_SET;
-static int ncr_dma = NCR_NOT_SET;
-static int ncr_addr = NCR_NOT_SET;
-static int ncr_5380 = NCR_NOT_SET;
-static int ncr_53c400 = NCR_NOT_SET;
-static int ncr_53c400a = NCR_NOT_SET;
-static int dtc_3181e = NCR_NOT_SET;
+static int ncr_irq;
+static int ncr_dma;
+static int ncr_addr;
+static int ncr_5380;
+static int ncr_53c400;
+static int ncr_53c400a;
+static int dtc_3181e;
+static int hp_c2502;
 
 static struct override {
        NCR5380_map_type NCR5380_map_name;
@@ -121,7 +112,7 @@ static struct override {
 
 static void __init internal_setup(int board, char *str, int *ints)
 {
-       static int commandline_current = 0;
+       static int commandline_current;
        switch (board) {
        case BOARD_NCR5380:
                if (ints[0] != 2 && ints[0] != 3) {
@@ -235,6 +226,30 @@ static int __init do_DTC3181E_setup(char *str)
 
 #endif
 
+#ifndef SCSI_G_NCR5380_MEM
+/*
+ * Configure I/O address of 53C400A or DTC436 by writing magic numbers
+ * to ports 0x779 and 0x379.
+ */
+static void magic_configure(int idx, u8 irq, u8 magic[])
+{
+       u8 cfg = 0;
+
+       outb(magic[0], 0x779);
+       outb(magic[1], 0x379);
+       outb(magic[2], 0x379);
+       outb(magic[3], 0x379);
+       outb(magic[4], 0x379);
+
+       /* allowed IRQs for HP C2502 */
+       if (irq != 2 && irq != 3 && irq != 4 && irq != 5 && irq != 7)
+               irq = 0;
+       if (idx >= 0 && idx <= 7)
+               cfg = 0x80 | idx | (irq << 4);
+       outb(cfg, 0x379);
+}
+#endif
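The final outb() packs the enable bit, port index and IRQ select into one
configuration byte; a hand-worked example with illustrative values:

        /* port index 3, IRQ 5 */
        u8 cfg = 0x80 | 3 | (5 << 4);   /* == 0xd3 */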
+
 /**
  *     generic_NCR5380_detect  -       look for NCR5380 controllers
  *     @tpnt: the scsi template
@@ -243,19 +258,18 @@ static int __init do_DTC3181E_setup(char *str)
  *     and DTC436(ISAPnP) controllers. If overrides have been set we use
  *     them.
  *
- *     The caller supplied NCR5380_init function is invoked from here, before
- *     the interrupt line is taken.
- *
  *     Locks: none
  */
 
 static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
 {
-       static int current_override = 0;
+       static int current_override;
        int count;
        unsigned int *ports;
+       u8 *magic = NULL;
 #ifndef SCSI_G_NCR5380_MEM
        int i;
+       int port_idx = -1;
        unsigned long region_size = 16;
 #endif
        static unsigned int __initdata ncr_53c400a_ports[] = {
@@ -264,27 +278,36 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
        static unsigned int __initdata dtc_3181e_ports[] = {
                0x220, 0x240, 0x280, 0x2a0, 0x2c0, 0x300, 0x320, 0x340, 0
        };
-       int flags = 0;
+       static u8 ncr_53c400a_magic[] __initdata = {    /* 53C400A & DTC436 */
+               0x59, 0xb9, 0xc5, 0xae, 0xa6
+       };
+       static u8 hp_c2502_magic[] __initdata = {       /* HP C2502 */
+               0x0f, 0x22, 0xf0, 0x20, 0x80
+       };
+       int flags;
        struct Scsi_Host *instance;
+       struct NCR5380_hostdata *hostdata;
 #ifdef SCSI_G_NCR5380_MEM
        unsigned long base;
        void __iomem *iomem;
 #endif
 
-       if (ncr_irq != NCR_NOT_SET)
+       if (ncr_irq)
                overrides[0].irq = ncr_irq;
-       if (ncr_dma != NCR_NOT_SET)
+       if (ncr_dma)
                overrides[0].dma = ncr_dma;
-       if (ncr_addr != NCR_NOT_SET)
+       if (ncr_addr)
                overrides[0].NCR5380_map_name = (NCR5380_map_type) ncr_addr;
-       if (ncr_5380 != NCR_NOT_SET)
+       if (ncr_5380)
                overrides[0].board = BOARD_NCR5380;
-       else if (ncr_53c400 != NCR_NOT_SET)
+       else if (ncr_53c400)
                overrides[0].board = BOARD_NCR53C400;
-       else if (ncr_53c400a != NCR_NOT_SET)
+       else if (ncr_53c400a)
                overrides[0].board = BOARD_NCR53C400A;
-       else if (dtc_3181e != NCR_NOT_SET)
+       else if (dtc_3181e)
                overrides[0].board = BOARD_DTC3181E;
+       else if (hp_c2502)
+               overrides[0].board = BOARD_HP_C2502;
 #ifndef SCSI_G_NCR5380_MEM
        if (!current_override && isapnp_present()) {
                struct pnp_dev *dev = NULL;
@@ -318,41 +341,45 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
                }
        }
 #endif
-       tpnt->proc_name = "g_NCR5380";
 
        for (count = 0; current_override < NO_OVERRIDES; ++current_override) {
                if (!(overrides[current_override].NCR5380_map_name))
                        continue;
 
                ports = NULL;
+               flags = 0;
                switch (overrides[current_override].board) {
                case BOARD_NCR5380:
                        flags = FLAG_NO_PSEUDO_DMA;
                        break;
                case BOARD_NCR53C400:
-                       flags = FLAG_NCR53C400;
+#ifdef PSEUDO_DMA
+                       flags = FLAG_NO_DMA_FIXUP;
+#endif
                        break;
                case BOARD_NCR53C400A:
-                       flags = FLAG_NO_PSEUDO_DMA;
+                       flags = FLAG_NO_DMA_FIXUP;
+                       ports = ncr_53c400a_ports;
+                       magic = ncr_53c400a_magic;
+                       break;
+               case BOARD_HP_C2502:
+                       flags = FLAG_NO_DMA_FIXUP;
                        ports = ncr_53c400a_ports;
+                       magic = hp_c2502_magic;
                        break;
                case BOARD_DTC3181E:
-                       flags = FLAG_NO_PSEUDO_DMA | FLAG_DTC3181E;
+                       flags = FLAG_NO_DMA_FIXUP;
                        ports = dtc_3181e_ports;
+                       magic = ncr_53c400a_magic;
                        break;
                }
 
 #ifndef SCSI_G_NCR5380_MEM
-               if (ports) {
+               if (ports && magic) {
                        /* wakeup sequence for the NCR53C400A and DTC3181E */
 
                        /* Disable the adapter and look for a free io port */
-                       outb(0x59, 0x779);
-                       outb(0xb9, 0x379);
-                       outb(0xc5, 0x379);
-                       outb(0xae, 0x379);
-                       outb(0xa6, 0x379);
-                       outb(0x00, 0x379);
+                       magic_configure(-1, 0, magic);
 
                        if (overrides[current_override].NCR5380_map_name != PORT_AUTO)
                                for (i = 0; ports[i]; i++) {
@@ -371,17 +398,12 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
                                }
                        if (ports[i]) {
                                /* At this point we have our region reserved */
-                               outb(0x59, 0x779);
-                               outb(0xb9, 0x379);
-                               outb(0xc5, 0x379);
-                               outb(0xae, 0x379);
-                               outb(0xa6, 0x379);
-                               outb(0x80 | i, 0x379);  /* set io port to be used */
+                               magic_configure(i, 0, magic); /* no IRQ yet */
                                outb(0xc0, ports[i] + 9);
                                if (inb(ports[i] + 9) != 0x80)
                                        continue;
-                               else
-                                       overrides[current_override].NCR5380_map_name = ports[i];
+                               overrides[current_override].NCR5380_map_name = ports[i];
+                               port_idx = i;
                        } else
                                continue;
                }
@@ -403,24 +425,65 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
                }
 #endif
                instance = scsi_register(tpnt, sizeof(struct NCR5380_hostdata));
-               if (instance == NULL) {
-#ifndef SCSI_G_NCR5380_MEM
-                       release_region(overrides[current_override].NCR5380_map_name, region_size);
-#else
-                       iounmap(iomem);
-                       release_mem_region(base, NCR5380_region_size);
-#endif
-                       continue;
-               }
+               if (instance == NULL)
+                       goto out_release;
+               hostdata = shost_priv(instance);
 
-               instance->NCR5380_instance_name = overrides[current_override].NCR5380_map_name;
 #ifndef SCSI_G_NCR5380_MEM
+               instance->io_port = overrides[current_override].NCR5380_map_name;
                instance->n_io_port = region_size;
+               hostdata->io_width = 1; /* 8-bit PDMA by default */
+
+               /*
+                * On NCR53C400 boards, NCR5380 registers are mapped 8 past
+                * the base address.
+                */
+               switch (overrides[current_override].board) {
+               case BOARD_NCR53C400:
+                       instance->io_port += 8;
+                       hostdata->c400_ctl_status = 0;
+                       hostdata->c400_blk_cnt = 1;
+                       hostdata->c400_host_buf = 4;
+                       break;
+               case BOARD_DTC3181E:
+                       hostdata->io_width = 2; /* 16-bit PDMA */
+                       /* fall through */
+               case BOARD_NCR53C400A:
+               case BOARD_HP_C2502:
+                       hostdata->c400_ctl_status = 9;
+                       hostdata->c400_blk_cnt = 10;
+                       hostdata->c400_host_buf = 8;
+                       break;
+               }
 #else
-               ((struct NCR5380_hostdata *)instance->hostdata)->iomem = iomem;
+               instance->base = overrides[current_override].NCR5380_map_name;
+               hostdata->iomem = iomem;
+               switch (overrides[current_override].board) {
+               case BOARD_NCR53C400:
+                       hostdata->c400_ctl_status = 0x100;
+                       hostdata->c400_blk_cnt = 0x101;
+                       hostdata->c400_host_buf = 0x104;
+                       break;
+               case BOARD_DTC3181E:
+               case BOARD_NCR53C400A:
+               case BOARD_HP_C2502:
+                       pr_err(DRV_MODULE_NAME ": unknown register offsets\n");
+                       goto out_unregister;
+               }
 #endif
 
-               NCR5380_init(instance, flags);
+               if (NCR5380_init(instance, flags))
+                       goto out_unregister;
+
+               switch (overrides[current_override].board) {
+               case BOARD_NCR53C400:
+               case BOARD_DTC3181E:
+               case BOARD_NCR53C400A:
+               case BOARD_HP_C2502:
+                       NCR5380_write(hostdata->c400_ctl_status, CSR_BASE);
+               }
+
+               NCR5380_maybe_reset_bus(instance);
 
                if (overrides[current_override].irq != IRQ_AUTO)
                        instance->irq = overrides[current_override].irq;
@@ -431,12 +494,18 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
                if (instance->irq == 255)
                        instance->irq = NO_IRQ;
 
-               if (instance->irq != NO_IRQ)
+               if (instance->irq != NO_IRQ) {
+#ifndef SCSI_G_NCR5380_MEM
+                       /* set IRQ for HP C2502 */
+                       if (overrides[current_override].board == BOARD_HP_C2502)
+                               magic_configure(port_idx, instance->irq, magic);
+#endif
                        if (request_irq(instance->irq, generic_NCR5380_intr,
                                        0, "NCR5380", instance)) {
                                printk(KERN_WARNING "scsi%d : IRQ%d not free, interrupts disabled\n", instance->host_no, instance->irq);
                                instance->irq = NO_IRQ;
                        }
+               }
 
                if (instance->irq == NO_IRQ) {
                        printk(KERN_INFO "scsi%d : interrupts not enabled. for better interactive performance,\n", instance->host_no);
@@ -447,6 +516,17 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
                ++count;
        }
        return count;
+
+out_unregister:
+       scsi_unregister(instance);
+out_release:
+#ifndef SCSI_G_NCR5380_MEM
+       release_region(overrides[current_override].NCR5380_map_name, region_size);
+#else
+       iounmap(iomem);
+       release_mem_region(base, NCR5380_region_size);
+#endif
+       return count;
 }
 
 /**
@@ -460,21 +540,15 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt)
  
 static int generic_NCR5380_release_resources(struct Scsi_Host *instance)
 {
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
-       
        if (instance->irq != NO_IRQ)
                free_irq(instance->irq, instance);
        NCR5380_exit(instance);
-
 #ifndef SCSI_G_NCR5380_MEM
-       release_region(instance->NCR5380_instance_name, instance->n_io_port);
+       release_region(instance->io_port, instance->n_io_port);
 #else
        iounmap(((struct NCR5380_hostdata *)instance->hostdata)->iomem);
-       release_mem_region(instance->NCR5380_instance_name, NCR5380_region_size);
+       release_mem_region(instance->base, NCR5380_region_size);
 #endif
-
-
        return 0;
 }
 
@@ -507,7 +581,7 @@ generic_NCR5380_biosparam(struct scsi_device *sdev, struct block_device *bdev,
 }
 #endif
 
-#ifdef NCR53C400_PSEUDO_DMA
+#ifdef PSEUDO_DMA
 
 /**
  *     NCR5380_pread           -       pseudo DMA read
@@ -521,75 +595,68 @@ generic_NCR5380_biosparam(struct scsi_device *sdev, struct block_device *bdev,
  
 static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, int len)
 {
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int blocks = len / 128;
        int start = 0;
-       int bl;
-
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
 
-       NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE | CSR_TRANS_DIR);
-       NCR5380_write(C400_BLOCK_COUNTER_REG, blocks);
+       NCR5380_write(hostdata->c400_ctl_status, CSR_BASE | CSR_TRANS_DIR);
+       NCR5380_write(hostdata->c400_blk_cnt, blocks);
        while (1) {
-               if ((bl = NCR5380_read(C400_BLOCK_COUNTER_REG)) == 0) {
+               if (NCR5380_read(hostdata->c400_blk_cnt) == 0)
                        break;
-               }
-               if (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ) {
+               if (NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ) {
                        printk(KERN_ERR "53C400r: Got 53C80_IRQ start=%d, blocks=%d\n", start, blocks);
                        return -1;
                }
-               while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY);
+               while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY)
+                       ; /* FIXME - no timeout */
 
 #ifndef SCSI_G_NCR5380_MEM
-               {
-                       int i;
-                       for (i = 0; i < 128; i++)
-                               dst[start + i] = NCR5380_read(C400_HOST_BUFFER);
-               }
+               if (hostdata->io_width == 2)
+                       insw(instance->io_port + hostdata->c400_host_buf,
+                                                       dst + start, 64);
+               else
+                       insb(instance->io_port + hostdata->c400_host_buf,
+                                                       dst + start, 128);
 #else
                /* implies SCSI_G_NCR5380_MEM */
-               memcpy_fromio(dst + start, iomem + NCR53C400_host_buffer, 128);
+               memcpy_fromio(dst + start,
+                             hostdata->iomem + NCR53C400_host_buffer, 128);
 #endif
                start += 128;
                blocks--;
        }
 
        if (blocks) {
-               while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY)
-               {
-                       // FIXME - no timeout
-               }
+               while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY)
+                       ; /* FIXME - no timeout */
 
 #ifndef SCSI_G_NCR5380_MEM
-               {
-                       int i;  
-                       for (i = 0; i < 128; i++)
-                               dst[start + i] = NCR5380_read(C400_HOST_BUFFER);
-               }
+               if (hostdata->io_width == 2)
+                       insw(instance->io_port + hostdata->c400_host_buf,
+                                                       dst + start, 64);
+               else
+                       insb(instance->io_port + hostdata->c400_host_buf,
+                                                       dst + start, 128);
 #else
                /* implies SCSI_G_NCR5380_MEM */
-               memcpy_fromio(dst + start, iomem + NCR53C400_host_buffer, 128);
+               memcpy_fromio(dst + start,
+                             hostdata->iomem + NCR53C400_host_buffer, 128);
 #endif
                start += 128;
                blocks--;
        }
 
-       if (!(NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ))
+       if (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ))
                printk("53C400r: no 53C80 gated irq after transfer");
 
-#if 0
-       /*
-        *      DON'T DO THIS - THEY NEVER ARRIVE!
-        */
-       printk("53C400r: Waiting for 53C80 registers\n");
-       while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_53C80_REG)
+       /* wait for 53C80 registers to be available */
+       while (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_53C80_REG))
                ;
-#endif
+
        if (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_END_DMA_TRANSFER))
                printk(KERN_ERR "53C400r: no end dma signal\n");
                
-       NCR5380_write(MODE_REG, MR_BASE);
-       NCR5380_read(RESET_PARITY_INTERRUPT_REG);
        return 0;
 }
 
@@ -605,89 +672,91 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst,
 
 static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, int len)
 {
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
        int blocks = len / 128;
        int start = 0;
-       int bl;
-       int i;
 
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
-
-       NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE);
-       NCR5380_write(C400_BLOCK_COUNTER_REG, blocks);
+       NCR5380_write(hostdata->c400_ctl_status, CSR_BASE);
+       NCR5380_write(hostdata->c400_blk_cnt, blocks);
        while (1) {
-               if (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ) {
+               if (NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ) {
                        printk(KERN_ERR "53C400w: Got 53C80_IRQ start=%d, blocks=%d\n", start, blocks);
                        return -1;
                }
 
-               if ((bl = NCR5380_read(C400_BLOCK_COUNTER_REG)) == 0) {
+               if (NCR5380_read(hostdata->c400_blk_cnt) == 0)
                        break;
-               }
-               while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY)
+               while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY)
                        ; // FIXME - timeout
 #ifndef SCSI_G_NCR5380_MEM
-               {
-                       for (i = 0; i < 128; i++)
-                               NCR5380_write(C400_HOST_BUFFER, src[start + i]);
-               }
+               if (hostdata->io_width == 2)
+                       outsw(instance->io_port + hostdata->c400_host_buf,
+                                                       src + start, 64);
+               else
+                       outsb(instance->io_port + hostdata->c400_host_buf,
+                                                       src + start, 128);
 #else
                /* implies SCSI_G_NCR5380_MEM */
-               memcpy_toio(iomem + NCR53C400_host_buffer, src + start, 128);
+               memcpy_toio(hostdata->iomem + NCR53C400_host_buffer,
+                           src + start, 128);
 #endif
                start += 128;
                blocks--;
        }
        if (blocks) {
-               while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY)
+               while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY)
                        ; // FIXME - no timeout
 
 #ifndef SCSI_G_NCR5380_MEM
-               {
-                       for (i = 0; i < 128; i++)
-                               NCR5380_write(C400_HOST_BUFFER, src[start + i]);
-               }
+               if (hostdata->io_width == 2)
+                       outsw(instance->io_port + hostdata->c400_host_buf,
+                                                       src + start, 64);
+               else
+                       outsb(instance->io_port + hostdata->c400_host_buf,
+                                                       src + start, 128);
 #else
                /* implies SCSI_G_NCR5380_MEM */
-               memcpy_toio(iomem + NCR53C400_host_buffer, src + start, 128);
+               memcpy_toio(hostdata->iomem + NCR53C400_host_buffer,
+                           src + start, 128);
 #endif
                start += 128;
                blocks--;
        }
 
-#if 0
-       printk("53C400w: waiting for registers to be available\n");
-       THEY NEVER DO ! while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_53C80_REG);
-       printk("53C400w: Got em\n");
-#endif
-
-       /* Let's wait for this instead - could be ugly */
-       /* All documentation says to check for this. Maybe my hardware is too
-        * fast. Waiting for it seems to work fine! KLL
-        */
-       while (!(i = NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ))
-               ;       // FIXME - no timeout
-
-       /*
-        * I know. i is certainly != 0 here but the loop is new. See previous
-        * comment.
-        */
-       if (i) {
-               if (!((i = NCR5380_read(BUS_AND_STATUS_REG)) & BASR_END_DMA_TRANSFER))
-                       printk(KERN_ERR "53C400w: No END OF DMA bit - WHOOPS! BASR=%0x\n", i);
-       } else
-               printk(KERN_ERR "53C400w: no 53C80 gated irq after transfer (last block)\n");
+       /* wait for 53C80 registers to be available */
+       while (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_53C80_REG)) {
+               udelay(4); /* DTC436 chip hangs without this */
+               /* FIXME - no timeout */
+       }
 
-#if 0
        if (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_END_DMA_TRANSFER)) {
                printk(KERN_ERR "53C400w: no end dma signal\n");
        }
-#endif
+
        while (!(NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT))
                ;       // TIMEOUT
        return 0;
 }
-#endif                         /* PSEUDO_DMA */
+
+static int generic_NCR5380_dma_xfer_len(struct scsi_cmnd *cmd)
+{
+       int transfersize = cmd->transfersize;
+
+       /* Limit transfers to 32K, for xx400 & xx406
+        * pseudo-DMA that transfers in 128-byte blocks.
+        */
+       if (transfersize > 32 * 1024 && cmd->SCp.this_residual &&
+           !(cmd->SCp.this_residual % transfersize))
+               transfersize = 32 * 1024;
+
+       /* 53C400 datasheet: non-modulo-128-byte transfers should use PIO */
+       if (transfersize % 128)
+               transfersize = 0;
+
+       return transfersize;
+}
+
+#endif /* PSEUDO_DMA */
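generic_NCR5380_dma_xfer_len() adds one rule on top of the 32K clamp used
by dtc above: any length that is not a multiple of 128 returns 0, so the
core falls back to PIO. Illustrative values:

        /* 512                -> 512 (multiple of 128, under the cap)
         * 510                -> 0   (not modulo 128: use PIO)
         * 64K, residual 128K -> 32K (clamped; still modulo 128)
         */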
 
 /*
  *     Include the NCR5380 core code that we build our driver around   
@@ -696,22 +765,24 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src,
 #include "NCR5380.c"
 
 static struct scsi_host_template driver_template = {
-       .show_info              = generic_NCR5380_show_info,
-       .name                   = "Generic NCR5380/NCR53C400 SCSI",
-       .detect                 = generic_NCR5380_detect,
-       .release                = generic_NCR5380_release_resources,
-       .info                   = generic_NCR5380_info,
-       .queuecommand           = generic_NCR5380_queue_command,
+       .proc_name              = DRV_MODULE_NAME,
+       .name                   = "Generic NCR5380/NCR53C400 SCSI",
+       .detect                 = generic_NCR5380_detect,
+       .release                = generic_NCR5380_release_resources,
+       .info                   = generic_NCR5380_info,
+       .queuecommand           = generic_NCR5380_queue_command,
        .eh_abort_handler       = generic_NCR5380_abort,
        .eh_bus_reset_handler   = generic_NCR5380_bus_reset,
-       .bios_param             = NCR5380_BIOSPARAM,
-       .can_queue              = CAN_QUEUE,
-        .this_id               = 7,
-        .sg_tablesize          = SG_ALL,
-       .cmd_per_lun            = CMD_PER_LUN,
-        .use_clustering                = DISABLE_CLUSTERING,
+       .bios_param             = NCR5380_BIOSPARAM,
+       .can_queue              = 16,
+       .this_id                = 7,
+       .sg_tablesize           = SG_ALL,
+       .cmd_per_lun            = 2,
+       .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
-#include <linux/module.h>
+
 #include "scsi_module.c"
 
 module_param(ncr_irq, int, 0);
@@ -721,6 +792,7 @@ module_param(ncr_5380, int, 0);
 module_param(ncr_53c400, int, 0);
 module_param(ncr_53c400a, int, 0);
 module_param(dtc_3181e, int, 0);
+module_param(hp_c2502, int, 0);
 MODULE_LICENSE("GPL");
 
 #if !defined(SCSI_G_NCR5380_MEM) && defined(MODULE)
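The interesting part of the PDMA rewrite is the switch from a
byte-at-a-time register loop to string I/O: each 128-byte host-buffer
block is drained with a single insb(), or with insw() and half the count
when the DTC3181E's 16-bit path is selected. A self-contained sketch of
the pattern (the port and function names are illustrative):

        #include <linux/types.h>
        #include <asm/io.h>

        /* drain one 128-byte block; io_width is 1 (insb) or 2 (insw) */
        static void read_block(unsigned long port, int io_width, u8 *dst)
        {
                if (io_width == 2)
                        insw(port, dst, 64);    /* 64 x 16 bits = 128 bytes */
                else
                        insb(port, dst, 128);   /* 128 x 8 bits */
        }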
index bea1a3b9b862dfd4b944072af3fbab22344bd66c..6f3d2ac4f185c2ff64f6c6a41721f604f08edab7 100644 (file)
 #ifndef GENERIC_NCR5380_H
 #define GENERIC_NCR5380_H
 
-#ifdef NCR53C400
+#ifdef CONFIG_SCSI_GENERIC_NCR53C400
 #define BIOSPARAM
 #define NCR5380_BIOSPARAM generic_NCR5380_biosparam
 #else
 #define NCR5380_BIOSPARAM NULL
 #endif
 
-#ifndef ASM
-
-#ifndef CMD_PER_LUN
-#define CMD_PER_LUN 2
-#endif
-
-#ifndef CAN_QUEUE
-#define CAN_QUEUE 16
-#endif
-
 #define __STRVAL(x) #x
 #define STRVAL(x) __STRVAL(x)
 
 #ifndef SCSI_G_NCR5380_MEM
+#define DRV_MODULE_NAME "g_NCR5380"
 
-#define NCR5380_map_config port
 #define NCR5380_map_type int
 #define NCR5380_map_name port
-#define NCR5380_instance_name io_port
-#define NCR53C400_register_offset 0
-#define NCR53C400_address_adjust 8
 
-#ifdef NCR53C400
+#ifdef CONFIG_SCSI_GENERIC_NCR53C400
 #define NCR5380_region_size 16
 #else
 #define NCR5380_region_size 8
 #endif
 
-#define NCR5380_read(reg) (inb(NCR5380_map_name + (reg)))
-#define NCR5380_write(reg, value) (outb((value), (NCR5380_map_name + (reg))))
+#define NCR5380_read(reg) \
+       inb(instance->io_port + (reg))
+#define NCR5380_write(reg, value) \
+       outb(value, instance->io_port + (reg))
 
 #define NCR5380_implementation_fields \
-    NCR5380_map_type NCR5380_map_name
-
-#define NCR5380_local_declare() \
-    register NCR5380_implementation_fields
-
-#define NCR5380_setup(instance) \
-    NCR5380_map_name = (NCR5380_map_type)((instance)->NCR5380_instance_name)
+       int c400_ctl_status; \
+       int c400_blk_cnt; \
+       int c400_host_buf; \
+       int io_width;
 
 #else 
 /* therefore SCSI_G_NCR5380_MEM */
+#define DRV_MODULE_NAME "g_NCR5380_mmio"
 
-#define NCR5380_map_config memory
 #define NCR5380_map_type unsigned long
 #define NCR5380_map_name base
-#define NCR5380_instance_name base
-#define NCR53C400_register_offset 0x108
-#define NCR53C400_address_adjust 0
 #define NCR53C400_mem_base 0x3880
 #define NCR53C400_host_buffer 0x3900
 #define NCR5380_region_size 0x3a00
 
-#define NCR5380_read(reg) readb(iomem + NCR53C400_mem_base + (reg))
-#define NCR5380_write(reg, value) writeb(value, iomem + NCR53C400_mem_base + (reg))
+#define NCR5380_read(reg) \
+       readb(((struct NCR5380_hostdata *)shost_priv(instance))->iomem + \
+             NCR53C400_mem_base + (reg))
+#define NCR5380_write(reg, value) \
+       writeb(value, ((struct NCR5380_hostdata *)shost_priv(instance))->iomem + \
+              NCR53C400_mem_base + (reg))
 
 #define NCR5380_implementation_fields \
-    NCR5380_map_type NCR5380_map_name; \
-    void __iomem *iomem;
-
-#define NCR5380_local_declare() \
-    register void __iomem *iomem
-
-#define NCR5380_setup(instance) \
-    iomem = (((struct NCR5380_hostdata *)(instance)->hostdata)->iomem)
+       void __iomem *iomem; \
+       int c400_ctl_status; \
+       int c400_blk_cnt; \
+       int c400_host_buf;
 
 #endif
 
+#define NCR5380_dma_xfer_len(instance, cmd, phase) \
+        generic_NCR5380_dma_xfer_len(cmd)
+
 #define NCR5380_intr generic_NCR5380_intr
 #define NCR5380_queue_command generic_NCR5380_queue_command
 #define NCR5380_abort generic_NCR5380_abort
 #define BOARD_NCR53C400        1
 #define BOARD_NCR53C400A 2
 #define BOARD_DTC3181E 3
+#define BOARD_HP_C2502 4
 
-#endif /* ndef ASM */
 #endif /* GENERIC_NCR5380_H */
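NCR5380_implementation_fields is spliced into struct NCR5380_hostdata by
the core header, so the port-mapped build now carries, by hand expansion
(a sketch; the core fields are elided):

        struct NCR5380_hostdata {
                /* ... core fields from NCR5380.h ... */
                int c400_ctl_status;    /* 53C400 register offsets */
                int c400_blk_cnt;
                int c400_host_buf;
                int io_width;           /* 1 = 8-bit PDMA, 2 = 16-bit */
        };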
 
index d54381149c0d513b16cb355b5ec632636073321c..057fdeb720acec997f4a4a5952318c58ab2f0b3e 100644 (file)
 /* ITCT header */
 /* qw0 */
 #define ITCT_HDR_DEV_TYPE_OFF          0
-#define ITCT_HDR_DEV_TYPE_MSK          (0x3 << ITCT_HDR_DEV_TYPE_OFF)
+#define ITCT_HDR_DEV_TYPE_MSK          (0x3ULL << ITCT_HDR_DEV_TYPE_OFF)
 #define ITCT_HDR_VALID_OFF             2
-#define ITCT_HDR_VALID_MSK             (0x1 << ITCT_HDR_VALID_OFF)
-#define ITCT_HDR_BREAK_REPLY_ENA_OFF   3
-#define ITCT_HDR_BREAK_REPLY_ENA_MSK   (0x1 << ITCT_HDR_BREAK_REPLY_ENA_OFF)
+#define ITCT_HDR_VALID_MSK             (0x1ULL << ITCT_HDR_VALID_OFF)
 #define ITCT_HDR_AWT_CONTROL_OFF       4
-#define ITCT_HDR_AWT_CONTROL_MSK       (0x1 << ITCT_HDR_AWT_CONTROL_OFF)
+#define ITCT_HDR_AWT_CONTROL_MSK       (0x1ULL << ITCT_HDR_AWT_CONTROL_OFF)
 #define ITCT_HDR_MAX_CONN_RATE_OFF     5
-#define ITCT_HDR_MAX_CONN_RATE_MSK     (0xf << ITCT_HDR_MAX_CONN_RATE_OFF)
+#define ITCT_HDR_MAX_CONN_RATE_MSK     (0xfULL << ITCT_HDR_MAX_CONN_RATE_OFF)
 #define ITCT_HDR_VALID_LINK_NUM_OFF    9
-#define ITCT_HDR_VALID_LINK_NUM_MSK    (0xf << ITCT_HDR_VALID_LINK_NUM_OFF)
+#define ITCT_HDR_VALID_LINK_NUM_MSK    (0xfULL << ITCT_HDR_VALID_LINK_NUM_OFF)
 #define ITCT_HDR_PORT_ID_OFF           13
-#define ITCT_HDR_PORT_ID_MSK           (0x7 << ITCT_HDR_PORT_ID_OFF)
+#define ITCT_HDR_PORT_ID_MSK           (0x7ULL << ITCT_HDR_PORT_ID_OFF)
 #define ITCT_HDR_SMP_TIMEOUT_OFF       16
-#define ITCT_HDR_SMP_TIMEOUT_MSK       (0xffff << ITCT_HDR_SMP_TIMEOUT_OFF)
-#define ITCT_HDR_MAX_BURST_BYTES_OFF   16
-#define ITCT_HDR_MAX_BURST_BYTES_MSK   (0xffffffff << \
-                                       ITCT_MAX_BURST_BYTES_OFF)
+#define ITCT_HDR_SMP_TIMEOUT_MSK       (0xffffULL << ITCT_HDR_SMP_TIMEOUT_OFF)
 /* qw1 */
 #define ITCT_HDR_MAX_SAS_ADDR_OFF      0
 #define ITCT_HDR_MAX_SAS_ADDR_MSK      (0xffffffffffffffff << \
                                        ITCT_HDR_MAX_SAS_ADDR_OFF)
 /* qw2 */
 #define ITCT_HDR_IT_NEXUS_LOSS_TL_OFF  0
-#define ITCT_HDR_IT_NEXUS_LOSS_TL_MSK  (0xffff << \
+#define ITCT_HDR_IT_NEXUS_LOSS_TL_MSK  (0xffffULL << \
                                        ITCT_HDR_IT_NEXUS_LOSS_TL_OFF)
 #define ITCT_HDR_BUS_INACTIVE_TL_OFF   16
-#define ITCT_HDR_BUS_INACTIVE_TL_MSK   (0xffff << \
+#define ITCT_HDR_BUS_INACTIVE_TL_MSK   (0xffffULL << \
                                        ITCT_HDR_BUS_INACTIVE_TL_OFF)
 #define ITCT_HDR_MAX_CONN_TL_OFF       32
-#define ITCT_HDR_MAX_CONN_TL_MSK       (0xffff << \
+#define ITCT_HDR_MAX_CONN_TL_MSK       (0xffffULL << \
                                        ITCT_HDR_MAX_CONN_TL_OFF)
 #define ITCT_HDR_REJ_OPEN_TL_OFF       48
-#define ITCT_HDR_REJ_OPEN_TL_MSK       (0xffff << \
-                                       ITCT_REJ_OPEN_TL_OFF)
+#define ITCT_HDR_REJ_OPEN_TL_MSK       (0xffffULL << \
+                                       ITCT_HDR_REJ_OPEN_TL_OFF)
 
 /* Err record header */
 #define ERR_HDR_DMA_TX_ERR_TYPE_OFF    0
@@ -533,10 +528,10 @@ static void setup_itct_v1_hw(struct hisi_hba *hisi_hba,
        itct->sas_addr = __swab64(itct->sas_addr);
 
        /* qw2 */
-       itct->qw2 = cpu_to_le64((500 < ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) |
-                               (0xff00 < ITCT_HDR_BUS_INACTIVE_TL_OFF) |
-                               (0xff00 < ITCT_HDR_MAX_CONN_TL_OFF) |
-                               (0xff00 < ITCT_HDR_REJ_OPEN_TL_OFF));
+       itct->qw2 = cpu_to_le64((500ULL << ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) |
+                               (0xff00ULL << ITCT_HDR_BUS_INACTIVE_TL_OFF) |
+                               (0xff00ULL << ITCT_HDR_MAX_CONN_TL_OFF) |
+                               (0xff00ULL << ITCT_HDR_REJ_OPEN_TL_OFF));
 }
 
 static void free_device_v1_hw(struct hisi_hba *hisi_hba,
@@ -544,7 +539,8 @@ static void free_device_v1_hw(struct hisi_hba *hisi_hba,
 {
        u64 dev_id = sas_dev->device_id;
        struct hisi_sas_itct *itct = &hisi_hba->itct[dev_id];
-       u32 qw0, reg_val = hisi_sas_read32(hisi_hba, CFG_AGING_TIME);
+       u64 qw0;
+       u32 reg_val = hisi_sas_read32(hisi_hba, CFG_AGING_TIME);
 
        reg_val |= CFG_AGING_TIME_ITCT_REL_MSK;
        hisi_sas_write32(hisi_hba, CFG_AGING_TIME, reg_val);
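Two classes of bug are fixed in this file: 32-bit integer constants
shifted at or past bit 31 (undefined behaviour, and in any case unable to
reach the upper half of a u64), and a '<' that should have been '<<' in
setup_itct_v1_hw(), which turned the qw2 timeout fields into 0-or-1
booleans. A standalone illustration (not driver code):

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                /* 0xffff << 48 would shift a 32-bit int by 48: undefined.
                 * The ULL suffix widens the constant before the shift. */
                uint64_t mask = 0xffffULL << 48;

                /* the old '<' typo: 500 < 0 evaluates to 0, not a field */
                uint64_t wrong = 500 < 0;
                uint64_t right = 500ULL << 0;

                printf("%llx %llu %llu\n", (unsigned long long)mask,
                       (unsigned long long)wrong, (unsigned long long)right);
                return 0;
        }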
index 4e1a632ccf162ea52365b84e71aab19b44619280..f8b88fa78e6274f8b5956d6ea6a3f057881132f5 100644 (file)
@@ -43,6 +43,7 @@ typedef struct {
        unsigned dp:1;          /* Data phase present           */
        unsigned rd:1;          /* Read data in data phase      */
        unsigned wanted:1;      /* Parport sharing busy flag    */
+       unsigned int dev_no;    /* Device number                */
        wait_queue_head_t *waiting;
        struct Scsi_Host *host;
        struct list_head list;
@@ -1120,15 +1121,40 @@ static struct scsi_host_template imm_template = {
 
 static LIST_HEAD(imm_hosts);
 
+/*
+ * Finds the first available device number that can be alloted to the
+ * new imm device and returns the address of the previous node so that
+ * we can add to the tail and have a list in the ascending order.
+ */
+
+static inline imm_struct *find_parent(void)
+{
+       imm_struct *dev, *par = NULL;
+       unsigned int cnt = 0;
+
+       if (list_empty(&imm_hosts))
+               return NULL;
+
+       list_for_each_entry(dev, &imm_hosts, list) {
+               if (dev->dev_no != cnt)
+                       return par;
+               cnt++;
+               par = dev;
+       }
+
+       return par;
+}
+
 static int __imm_attach(struct parport *pb)
 {
        struct Scsi_Host *host;
-       imm_struct *dev;
+       imm_struct *dev, *temp;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waiting);
        DEFINE_WAIT(wait);
        int ports;
        int modes, ppb;
        int err = -ENOMEM;
+       struct pardev_cb imm_cb;
 
        init_waitqueue_head(&waiting);
 
@@ -1141,9 +1167,15 @@ static int __imm_attach(struct parport *pb)
        dev->mode = IMM_AUTODETECT;
        INIT_LIST_HEAD(&dev->list);
 
-       dev->dev = parport_register_device(pb, "imm", NULL, imm_wakeup,
-                                               NULL, 0, dev);
+       temp = find_parent();
+       if (temp)
+               dev->dev_no = temp->dev_no + 1;
+
+       memset(&imm_cb, 0, sizeof(imm_cb));
+       imm_cb.private = dev;
+       imm_cb.wakeup = imm_wakeup;
 
+       dev->dev = parport_register_dev_model(pb, "imm", &imm_cb, dev->dev_no);
        if (!dev->dev)
                goto out;
 
@@ -1207,7 +1239,10 @@ static int __imm_attach(struct parport *pb)
        host->unique_id = pb->number;
        *(imm_struct **)&host->hostdata = dev;
        dev->host = host;
-       list_add_tail(&dev->list, &imm_hosts);
+       if (!temp)
+               list_add_tail(&dev->list, &imm_hosts);
+       else
+               list_add_tail(&dev->list, &temp->list);
        err = scsi_add_host(host, NULL);
        if (err)
                goto out2;
@@ -1245,9 +1280,10 @@ static void imm_detach(struct parport *pb)
 }
 
 static struct parport_driver imm_driver = {
-       .name   = "imm",
-       .attach = imm_attach,
-       .detach = imm_detach,
+       .name           = "imm",
+       .match_port     = imm_attach,
+       .detach         = imm_detach,
+       .devmodel       = true,
 };
 
 static int __init imm_driver_init(void)
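The imm_hosts list stays sorted by dev_no and the first gap is reused:
with devices 0, 1 and 3 registered, the walk matches the counter at 0 and
1, stops at 3 != 2, and the new host gets dev_no 2, spliced in right after
the node numbered 1. A userspace analogue of the gap search
(illustrative):

        #include <stdio.h>

        int main(void)
        {
                int dev_no[] = { 0, 1, 3 };     /* sorted; 2 is free */
                unsigned int cnt = 0;
                int par = -1;                   /* index of the "parent" */

                for (int i = 0; i < 3; i++) {
                        if (dev_no[i] != cnt)
                                break;          /* first gap found */
                        par = i;
                        cnt++;
                }
                printf("new dev_no %u goes after node %d\n", cnt, par);
                return 0;
        }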
index 536cd5a8042245f4ad46929a256098e047d745dd..3b3e0998fa6e72b71f5417c4f942ff9bb67caee0 100644 (file)
@@ -3638,7 +3638,7 @@ static struct device_attribute ipr_ioa_reset_attr = {
        .store = ipr_store_reset_adapter
 };
 
-static int ipr_iopoll(struct blk_iopoll *iop, int budget);
+static int ipr_iopoll(struct irq_poll *iop, int budget);
  /**
  * ipr_show_iopoll_weight - Show ipr polling mode
  * @dev:       class device struct
@@ -3681,34 +3681,33 @@ static ssize_t ipr_store_iopoll_weight(struct device *dev,
        int i;
 
        if (!ioa_cfg->sis64) {
-               dev_info(&ioa_cfg->pdev->dev, "blk-iopoll not supported on this adapter\n");
+               dev_info(&ioa_cfg->pdev->dev, "irq_poll not supported on this adapter\n");
                return -EINVAL;
        }
        if (kstrtoul(buf, 10, &user_iopoll_weight))
                return -EINVAL;
 
        if (user_iopoll_weight > 256) {
-               dev_info(&ioa_cfg->pdev->dev, "Invalid blk-iopoll weight. It must be less than 256\n");
+               dev_info(&ioa_cfg->pdev->dev, "Invalid irq_poll weight. It must be less than 256\n");
                return -EINVAL;
        }
 
        if (user_iopoll_weight == ioa_cfg->iopoll_weight) {
-               dev_info(&ioa_cfg->pdev->dev, "Current blk-iopoll weight has the same weight\n");
+               dev_info(&ioa_cfg->pdev->dev, "Current irq_poll weight has the same weight\n");
                return strlen(buf);
        }
 
        if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
                for (i = 1; i < ioa_cfg->hrrq_num; i++)
-                       blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll);
+                       irq_poll_disable(&ioa_cfg->hrrq[i].iopoll);
        }
 
        spin_lock_irqsave(shost->host_lock, lock_flags);
        ioa_cfg->iopoll_weight = user_iopoll_weight;
        if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
                for (i = 1; i < ioa_cfg->hrrq_num; i++) {
-                       blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll,
+                       irq_poll_init(&ioa_cfg->hrrq[i].iopoll,
                                        ioa_cfg->iopoll_weight, ipr_iopoll);
-                       blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll);
                }
        }
        spin_unlock_irqrestore(shost->host_lock, lock_flags);
@@ -4003,13 +4002,12 @@ static ssize_t ipr_store_update_fw(struct device *dev,
        struct ipr_sglist *sglist;
        char fname[100];
        char *src;
-       int len, result, dnld_size;
+       int result, dnld_size;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
 
-       len = snprintf(fname, 99, "%s", buf);
-       fname[len-1] = '\0';
+       snprintf(fname, sizeof(fname), "%s", buf);
 
        if (request_firmware(&fw_entry, fname, &ioa_cfg->pdev->dev)) {
                dev_err(&ioa_cfg->pdev->dev, "Firmware file %s not found\n", fname);
@@ -5569,7 +5567,7 @@ static int ipr_process_hrrq(struct ipr_hrr_queue *hrr_queue, int budget,
        return num_hrrq;
 }
 
-static int ipr_iopoll(struct blk_iopoll *iop, int budget)
+static int ipr_iopoll(struct irq_poll *iop, int budget)
 {
        struct ipr_ioa_cfg *ioa_cfg;
        struct ipr_hrr_queue *hrrq;
@@ -5585,7 +5583,7 @@ static int ipr_iopoll(struct blk_iopoll *iop, int budget)
        completed_ops = ipr_process_hrrq(hrrq, budget, &doneq);
 
        if (completed_ops < budget)
-               blk_iopoll_complete(iop);
+               irq_poll_complete(iop);
        spin_unlock_irqrestore(hrrq->lock, hrrq_flags);
 
        list_for_each_entry_safe(ipr_cmd, temp, &doneq, queue) {
@@ -5693,8 +5691,7 @@ static irqreturn_t ipr_isr_mhrrq(int irq, void *devp)
        if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
                if ((be32_to_cpu(*hrrq->hrrq_curr) & IPR_HRRQ_TOGGLE_BIT) ==
                       hrrq->toggle_bit) {
-                       if (!blk_iopoll_sched_prep(&hrrq->iopoll))
-                               blk_iopoll_sched(&hrrq->iopoll);
+                       irq_poll_sched(&hrrq->iopoll);
                        spin_unlock_irqrestore(hrrq->lock, hrrq_flags);
                        return IRQ_HANDLED;
                }
@@ -10405,9 +10402,8 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id)
 
        if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
                for (i = 1; i < ioa_cfg->hrrq_num; i++) {
-                       blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll,
+                       irq_poll_init(&ioa_cfg->hrrq[i].iopoll,
                                        ioa_cfg->iopoll_weight, ipr_iopoll);
-                       blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll);
                }
        }
 
@@ -10436,7 +10432,7 @@ static void ipr_shutdown(struct pci_dev *pdev)
        if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
                ioa_cfg->iopoll_weight = 0;
                for (i = 1; i < ioa_cfg->hrrq_num; i++)
-                       blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll);
+                       irq_poll_disable(&ioa_cfg->hrrq[i].iopoll);
        }
 
        while (ioa_cfg->in_reset_reload) {
index a34c7a5a995e4bcbf61b1d859fb784b60a9cdbee..56c57068300a1fb947221f8bc32f467aa115ae80 100644 (file)
@@ -32,7 +32,7 @@
 #include <linux/libata.h>
 #include <linux/list.h>
 #include <linux/kref.h>
-#include <linux/blk-iopoll.h>
+#include <linux/irq_poll.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
 
@@ -517,7 +517,7 @@ struct ipr_hrr_queue {
        u8 allow_cmds:1;
        u8 removing_ioa:1;
 
-       struct blk_iopoll iopoll;
+       struct irq_poll iopoll;
 };
 
 /* Command packet structure */
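The blk_iopoll to irq_poll conversion is mostly renaming, but two
behavioural points show through the diff: irq_poll_sched() needs no
*_sched_prep() guard, and, judging by the dropped enable calls,
irq_poll_init() leaves the instance enabled. A minimal sketch of the
resulting pattern (struct and function names are illustrative, not from
ipr):

        #include <linux/interrupt.h>
        #include <linux/irq_poll.h>

        struct myhw {
                struct irq_poll iopoll;
        };

        static int myhw_poll(struct irq_poll *iop, int budget)
        {
                int done = 0;

                /* ... reap up to 'budget' completions, counting them ... */
                if (done < budget)
                        irq_poll_complete(iop);         /* all caught up */
                return done;
        }

        static irqreturn_t myhw_isr(int irq, void *data)
        {
                struct myhw *hw = data;

                irq_poll_sched(&hw->iopoll);            /* punt to softirq */
                return IRQ_HANDLED;
        }

        /* setup: irq_poll_init(&hw->iopoll, weight, myhw_poll); */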
index d64a769b8155ccda7ab0d0368c9bb84eb717e6a6..bb2381314a2bfe91c1552f4f257dc41919014aac 100644 (file)
@@ -12,7 +12,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
 #define PSEUDO_DMA
 
 #define NCR5380_implementation_fields   unsigned char *pdma_base
-#define NCR5380_local_declare()         struct Scsi_Host *_instance
-#define NCR5380_setup(instance)         _instance = instance
 
-#define NCR5380_read(reg)               macscsi_read(_instance, reg)
-#define NCR5380_write(reg, value)       macscsi_write(_instance, reg, value)
+#define NCR5380_read(reg)               macscsi_read(instance, reg)
+#define NCR5380_write(reg, value)       macscsi_write(instance, reg, value)
 
 #define NCR5380_pread                   macscsi_pread
 #define NCR5380_pwrite                  macscsi_pwrite
+#define NCR5380_dma_xfer_len(instance, cmd, phase)     (cmd->transfersize)
 
 #define NCR5380_intr                    macscsi_intr
 #define NCR5380_queue_command           macscsi_queue_command
@@ -51,8 +49,6 @@
 
 #include "NCR5380.h"
 
-#define RESET_BOOT
-
 static int setup_can_queue = -1;
 module_param(setup_can_queue, int, 0);
 static int setup_cmd_per_lun = -1;
@@ -65,17 +61,8 @@ static int setup_use_tagged_queuing = -1;
 module_param(setup_use_tagged_queuing, int, 0);
 static int setup_hostid = -1;
 module_param(setup_hostid, int, 0);
-
-/* Time (in jiffies) to wait after a reset; the SCSI standard calls for 250ms,
- * we usually do 0.5s to be on the safe side. But Toshiba CD-ROMs once more
- * need ten times the standard value... */
-#define TOSHIBA_DELAY
-
-#ifdef TOSHIBA_DELAY
-#define        AFTER_RESET_DELAY       (5*HZ/2)
-#else
-#define        AFTER_RESET_DELAY       (HZ/2)
-#endif
+static int setup_toshiba_delay = -1;
+module_param(setup_toshiba_delay, int, 0);
 
 /*
  * NCR 5380 register access functions
@@ -94,12 +81,12 @@ static inline void macscsi_write(struct Scsi_Host *instance, int reg, int value)
 #ifndef MODULE
 static int __init mac_scsi_setup(char *str)
 {
-       int ints[7];
+       int ints[8];
 
        (void)get_options(str, ARRAY_SIZE(ints), ints);
 
-       if (ints[0] < 1 || ints[0] > 6) {
-               pr_err("Usage: mac5380=<can_queue>[,<cmd_per_lun>[,<sg_tablesize>[,<hostid>[,<use_tags>[,<use_pdma>]]]]]\n");
+       if (ints[0] < 1) {
+               pr_err("Usage: mac5380=<can_queue>[,<cmd_per_lun>[,<sg_tablesize>[,<hostid>[,<use_tags>[,<use_pdma>[,<toshiba_delay>]]]]]]\n");
                return 0;
        }
        if (ints[0] >= 1)
@@ -114,50 +101,14 @@ static int __init mac_scsi_setup(char *str)
                setup_use_tagged_queuing = ints[5];
        if (ints[0] >= 6)
                setup_use_pdma = ints[6];
+       if (ints[0] >= 7)
+               setup_toshiba_delay = ints[7];
        return 1;
 }
 
 __setup("mac5380=", mac_scsi_setup);
 #endif /* !MODULE */
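With the seventh slot added, a boot line like the following (values
illustrative) enables the Toshiba delay:

        mac5380=16,2,128,7,0,1,1

get_options() stores the argument count in ints[0], so the new
ints[0] >= 7 test only fires when all seven values are present; a positive
final value becomes FLAG_TOSHIBA_DELAY at probe time.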
 
-#ifdef RESET_BOOT
-/*
- * Our 'bus reset on boot' function
- */
-
-static void mac_scsi_reset_boot(struct Scsi_Host *instance)
-{
-       unsigned long end;
-
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
-       
-       /*
-        * Do a SCSI reset to clean up the bus during initialization. No messing
-        * with the queues, interrupts, or locks necessary here.
-        */
-
-       printk(KERN_INFO "Macintosh SCSI: resetting the SCSI bus..." );
-
-       /* get in phase */
-       NCR5380_write( TARGET_COMMAND_REG,
-                     PHASE_SR_TO_TCR( NCR5380_read(STATUS_REG) ));
-
-       /* assert RST */
-       NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST );
-       /* The min. reset hold time is 25us, so 40us should be enough */
-       udelay( 50 );
-       /* reset RST and interrupt */
-       NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE );
-       NCR5380_read( RESET_PARITY_INTERRUPT_REG );
-
-       for( end = jiffies + AFTER_RESET_DELAY; time_before(jiffies, end); )
-               barrier();
-
-       printk(KERN_INFO " done\n" );
-}
-#endif
-
 #ifdef PSEUDO_DMA
 /* 
    Pseudo-DMA: (Ove Edlund)
@@ -235,9 +186,6 @@ static int macscsi_pread(struct Scsi_Host *instance,
        unsigned char *d;
        unsigned char *s;
 
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
-
        s = hostdata->pdma_base + (INPUT_DATA_REG << 4);
        d = dst;
 
@@ -329,9 +277,6 @@ static int macscsi_pwrite(struct Scsi_Host *instance,
        unsigned char *s;
        unsigned char *d;
 
-       NCR5380_local_declare();
-       NCR5380_setup(instance);
-
        s = src;
        d = hostdata->pdma_base + (OUTPUT_DATA_REG << 4);
 
@@ -364,20 +309,22 @@ static int macscsi_pwrite(struct Scsi_Host *instance,
 #define PFX                     DRV_MODULE_NAME ": "
 
 static struct scsi_host_template mac_scsi_template = {
-       .module                         = THIS_MODULE,
-       .proc_name                      = DRV_MODULE_NAME,
-       .show_info                      = macscsi_show_info,
-       .write_info                     = macscsi_write_info,
-       .name                           = "Macintosh NCR5380 SCSI",
-       .info                           = macscsi_info,
-       .queuecommand                   = macscsi_queue_command,
-       .eh_abort_handler               = macscsi_abort,
-       .eh_bus_reset_handler           = macscsi_bus_reset,
-       .can_queue                      = 16,
-       .this_id                        = 7,
-       .sg_tablesize                   = SG_ALL,
-       .cmd_per_lun                    = 2,
-       .use_clustering                 = DISABLE_CLUSTERING
+       .module                 = THIS_MODULE,
+       .proc_name              = DRV_MODULE_NAME,
+       .show_info              = macscsi_show_info,
+       .write_info             = macscsi_write_info,
+       .name                   = "Macintosh NCR5380 SCSI",
+       .info                   = macscsi_info,
+       .queuecommand           = macscsi_queue_command,
+       .eh_abort_handler       = macscsi_abort,
+       .eh_bus_reset_handler   = macscsi_bus_reset,
+       .can_queue              = 16,
+       .this_id                = 7,
+       .sg_tablesize           = SG_ALL,
+       .cmd_per_lun            = 2,
+       .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
 
 static int __init mac_scsi_probe(struct platform_device *pdev)
@@ -432,15 +379,14 @@ static int __init mac_scsi_probe(struct platform_device *pdev)
        } else
                host_flags |= FLAG_NO_PSEUDO_DMA;
 
-#ifdef RESET_BOOT
-       mac_scsi_reset_boot(instance);
-#endif
-
 #ifdef SUPPORT_TAGS
        host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0;
 #endif
+       host_flags |= setup_toshiba_delay > 0 ? FLAG_TOSHIBA_DELAY : 0;
 
-       NCR5380_init(instance, host_flags);
+       error = NCR5380_init(instance, host_flags);
+       if (error)
+               goto fail_init;
 
        if (instance->irq != NO_IRQ) {
                error = request_irq(instance->irq, macscsi_intr, IRQF_SHARED,
@@ -449,6 +395,8 @@ static int __init mac_scsi_probe(struct platform_device *pdev)
                        goto fail_irq;
        }
 
+       NCR5380_maybe_reset_bus(instance);
+
        error = scsi_add_host(instance, NULL);
        if (error)
                goto fail_host;
@@ -463,6 +411,7 @@ fail_host:
                free_irq(instance->irq, instance);
 fail_irq:
        NCR5380_exit(instance);
+fail_init:
        scsi_host_put(instance);
        return error;
 }
index a70692779a16c770300b410eda56a9f1823e9239..4cf9ed96414f05c881221dabcc18ca5717bd2c11 100644 (file)
@@ -179,8 +179,12 @@ mraid_mm_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 
        /*
         * The following call will block till a kioc is available
+        * or return NULL if the free-kioc list of the adapter
+        * (mraid_mmadp_t) passed to mraid_mm_alloc_kioc is empty
         */
        kioc = mraid_mm_alloc_kioc(adp);
+       if (!kioc)
+               return -ENXIO;
 
        /*
         * User sent the old mimd_t ioctl packet. Convert it to uioc_t.
index e81eadd08afc776d6be23b2e2af8f550841da1b4..512037e27783d6192f1cea65b41eefb5fbaa9db4 100644 (file)
@@ -1,6 +1,4 @@
 #define PSEUDO_DMA
-#define UNSAFE  /* Not unsafe for PAS16 -- use it */
-#define PDEBUG 0
 
 /*
  * This driver adapted from Drew Eckhardt's Trantor T128 driver
  
 #include <linux/module.h>
 
-#include <linux/signal.h>
-#include <linux/proc_fs.h>
 #include <asm/io.h>
 #include <asm/dma.h>
 #include <linux/blkdev.h>
-#include <linux/delay.h>
 #include <linux/interrupt.h>
-#include <linux/stat.h>
 #include <linux/init.h>
 
 #include <scsi/scsi_host.h>
@@ -87,8 +81,8 @@
 #include "NCR5380.h"
 
 
-static unsigned short pas16_addr = 0;
-static int pas16_irq = 0;
+static unsigned short pas16_addr;
+static int pas16_irq;
  
 
 static const int scsi_irq_translate[] =
@@ -146,22 +140,6 @@ static const unsigned short  pas16_offset[ 8 ] =
                    * START_DMA_INITIATOR_RECEIVE_REG wo
                    */
     };
-/*----------------------------------------------------------------*/
-/* the following will set the monitor border color (useful to find
- where something crashed or gets stuck at */
-/* 1 = blue
- 2 = green
- 3 = cyan
- 4 = red
- 5 = magenta
- 6 = yellow
- 7 = white
-*/
-#if 1
-#define rtrc(i) {inb(0x3da); outb(0x31, 0x3c0); outb((i), 0x3c0);}
-#else
-#define rtrc(i) {}
-#endif
 
 
 /*
@@ -205,7 +183,7 @@ static void __init
        outb( 0x01, io_port + P_TIMEOUT_STATUS_REG_OFFSET );   /* Reset TC */
        outb( 0x01, io_port + WAIT_STATE );   /* 1 Wait state */
 
-       NCR5380_read( RESET_PARITY_INTERRUPT_REG );
+       inb(io_port + pas16_offset[RESET_PARITY_INTERRUPT_REG]);
 
        /* Set the SCSI interrupt pointer without mucking up the sound
         * interrupt pointer in the same byte.
@@ -280,13 +258,13 @@ static int __init
      * put in an additional test to try to weed them out.
      */
 
-    outb( 0x01, io_port + WAIT_STATE );        /* 1 Wait state */
-    NCR5380_write( MODE_REG, 0x20 );           /* Is it really SCSI? */
-    if( NCR5380_read( MODE_REG ) != 0x20 )     /* Write to a reg.    */
-       return 0;                               /* and try to read    */
-    NCR5380_write( MODE_REG, 0x00 );           /* it back.           */
-    if( NCR5380_read( MODE_REG ) != 0x00 )
-       return 0;
+       outb(0x01, io_port + WAIT_STATE);             /* 1 Wait state */
+       outb(0x20, io_port + pas16_offset[MODE_REG]); /* Is it really SCSI? */
+       if (inb(io_port + pas16_offset[MODE_REG]) != 0x20) /* Write to a reg. */
+               return 0;                                  /* and try to read */
+       outb(0x00, io_port + pas16_offset[MODE_REG]);      /* it back. */
+       if (inb(io_port + pas16_offset[MODE_REG]) != 0x00)
+               return 0;
 
     return 1;
 }
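
A sketch of the write/read-back presence test reworked above, assuming the driver's pas16_offset[] table and MODE_REG from NCR5380.h (the helper name is illustrative). Writing two different values and verifying both weeds out a floating ISA bus, which tends to read back the last value driven or 0xff:

static int example_board_present(unsigned short io_port)
{
        outb(0x20, io_port + pas16_offset[MODE_REG]);
        if (inb(io_port + pas16_offset[MODE_REG]) != 0x20)
                return 0;               /* no 5380 behind this port */

        outb(0x00, io_port + pas16_offset[MODE_REG]);
        if (inb(io_port + pas16_offset[MODE_REG]) != 0x00)
                return 0;

        return 1;                       /* register reads back both values */
}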
@@ -305,7 +283,7 @@ static int __init
 
 static int __init pas16_setup(char *str)
 {
-    static int commandline_current = 0;
+       static int commandline_current;
     int i;
     int ints[10];
 
@@ -344,8 +322,8 @@ __setup("pas16=", pas16_setup);
 
 static int __init pas16_detect(struct scsi_host_template *tpnt)
 {
-    static int current_override = 0;
-    static unsigned short current_base = 0;
+       static int current_override;
+       static unsigned short current_base;
     struct Scsi_Host *instance;
     unsigned short io_port;
     int  count;
@@ -377,34 +355,32 @@ static int __init pas16_detect(struct scsi_host_template *tpnt)
        }
        else
            for (; !io_port && (current_base < NO_BASES); ++current_base) {
-#if (PDEBUG & PDEBUG_INIT)
-    printk("scsi-pas16 : probing io_port %04x\n", (unsigned int) bases[current_base].io_port);
-#endif
+               dprintk(NDEBUG_INIT, "pas16: probing io_port 0x%04x\n",
+                       (unsigned int)bases[current_base].io_port);
                if ( !bases[current_base].noauto &&
                     pas16_hw_detect( current_base ) ){
                        io_port = bases[current_base].io_port;
                        init_board( io_port, default_irqs[ current_base ], 0 ); 
-#if (PDEBUG & PDEBUG_INIT)
-                       printk("scsi-pas16 : detected board.\n");
-#endif
+                       dprintk(NDEBUG_INIT, "pas16: detected board\n");
                }
     }
 
-
-#if defined(PDEBUG) && (PDEBUG & PDEBUG_INIT)
-       printk("scsi-pas16 : io_port = %04x\n", (unsigned int) io_port);
-#endif
+       dprintk(NDEBUG_INIT, "pas16: io_port = 0x%04x\n",
+               (unsigned int)io_port);
 
        if (!io_port)
            break;
 
        instance = scsi_register (tpnt, sizeof(struct NCR5380_hostdata));
        if(instance == NULL)
-               break;
+               goto out;
                
        instance->io_port = io_port;
 
-       NCR5380_init(instance, 0);
+       if (NCR5380_init(instance, 0))
+               goto out_unregister;
+
+       NCR5380_maybe_reset_bus(instance);
 
        if (overrides[current_override].irq != IRQ_AUTO)
            instance->irq = overrides[current_override].irq;
@@ -431,14 +407,18 @@ static int __init pas16_detect(struct scsi_host_template *tpnt)
            outb( (inb(io_port + IO_CONFIG_3) & 0x0f), io_port + IO_CONFIG_3 );
        }
 
-#if defined(PDEBUG) && (PDEBUG & PDEBUG_INIT)
-       printk("scsi%d : irq = %d\n", instance->host_no, instance->irq);
-#endif
+       dprintk(NDEBUG_INIT, "scsi%d : irq = %d\n",
+               instance->host_no, instance->irq);
 
        ++current_override;
        ++count;
     }
     return count;
+
+out_unregister:
+       scsi_unregister(instance);
+out:
+       return count;
 }
 
 /*
@@ -561,29 +541,29 @@ static int pas16_release(struct Scsi_Host *shost)
        if (shost->irq != NO_IRQ)
                free_irq(shost->irq, shost);
        NCR5380_exit(shost);
-       if (shost->io_port && shost->n_io_port)
-               release_region(shost->io_port, shost->n_io_port);
        scsi_unregister(shost);
        return 0;
 }
 
 static struct scsi_host_template driver_template = {
-       .name           = "Pro Audio Spectrum-16 SCSI",
-       .detect         = pas16_detect,
-       .release        = pas16_release,
-       .proc_name      = "pas16",
-       .show_info      = pas16_show_info,
-       .write_info     = pas16_write_info,
-       .info           = pas16_info,
-       .queuecommand   = pas16_queue_command,
-       .eh_abort_handler = pas16_abort,
-       .eh_bus_reset_handler = pas16_bus_reset,
-       .bios_param     = pas16_biosparam, 
-       .can_queue      = CAN_QUEUE,
-       .this_id        = 7,
-       .sg_tablesize   = SG_ALL,
-       .cmd_per_lun    = CMD_PER_LUN,
-       .use_clustering = DISABLE_CLUSTERING,
+       .name                   = "Pro Audio Spectrum-16 SCSI",
+       .detect                 = pas16_detect,
+       .release                = pas16_release,
+       .proc_name              = "pas16",
+       .show_info              = pas16_show_info,
+       .write_info             = pas16_write_info,
+       .info                   = pas16_info,
+       .queuecommand           = pas16_queue_command,
+       .eh_abort_handler       = pas16_abort,
+       .eh_bus_reset_handler   = pas16_bus_reset,
+       .bios_param             = pas16_biosparam,
+       .can_queue              = 32,
+       .this_id                = 7,
+       .sg_tablesize           = SG_ALL,
+       .cmd_per_lun            = 2,
+       .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
 #include "scsi_module.c"
 
index c6109c80050bdf59d2b6933ba43579f8afa896ed..d375277172258391ed84ff1627c060f007e1f188 100644 (file)
@@ -24,9 +24,6 @@
 #ifndef PAS16_H
 #define PAS16_H
 
-#define PDEBUG_INIT    0x1
-#define PDEBUG_TRANSFER 0x2
-
 #define PAS16_DEFAULT_BASE_1  0x388
 #define PAS16_DEFAULT_BASE_2  0x384
 #define PAS16_DEFAULT_BASE_3  0x38c
 #define OPERATION_MODE_1 0xec03
 #define IO_CONFIG_3 0xf002
 
+#define NCR5380_implementation_fields /* none */
 
-#ifndef ASM
-
-#ifndef CMD_PER_LUN
-#define CMD_PER_LUN 2
-#endif
-
-#ifndef CAN_QUEUE
-#define CAN_QUEUE 32 
-#endif
-
-#define NCR5380_implementation_fields \
-    volatile unsigned short io_port
-
-#define NCR5380_local_declare() \
-    volatile unsigned short io_port
+#define PAS16_io_port(reg) (instance->io_port + pas16_offset[(reg)])
 
-#define NCR5380_setup(instance) \
-    io_port = (instance)->io_port
-
-#define PAS16_io_port(reg) ( io_port + pas16_offset[(reg)] )
-
-#if !(PDEBUG & PDEBUG_TRANSFER) 
 #define NCR5380_read(reg) ( inb(PAS16_io_port(reg)) )
 #define NCR5380_write(reg, value) ( outb((value),PAS16_io_port(reg)) )
-#else
-#define NCR5380_read(reg)                                              \
-    (((unsigned char) printk("scsi%d : read register %d at io_port %04x\n"\
-    , instance->hostno, (reg), PAS16_io_port(reg))), inb( PAS16_io_port(reg)) )
-
-#define NCR5380_write(reg, value)                                      \
-    (printk("scsi%d : write %02x to register %d at io_port %04x\n",    \
-           instance->hostno, (value), (reg), PAS16_io_port(reg)),      \
-    outb( (value),PAS16_io_port(reg) ) )
-
-#endif
 
+#define NCR5380_dma_xfer_len(instance, cmd, phase)     (cmd->transfersize)
 
 #define NCR5380_intr pas16_intr
-#define do_NCR5380_intr do_pas16_intr
 #define NCR5380_queue_command pas16_queue_command
 #define NCR5380_abort pas16_abort
 #define NCR5380_bus_reset pas16_bus_reset
    
 #define PAS16_IRQS 0xd4a8 
 
-#endif /* ndef ASM */
 #endif /* PAS16_H */
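
For contrast, a sketch of the accessor indirection pas16.h now uses: the shared NCR5380.c core only touches registers through NCR5380_read()/NCR5380_write(), so a board with scattered port addresses just supplies a translation macro keyed on the instance pointer available at every call site. The offsets below are illustrative, not the real PAS16 map:

static const unsigned short example_offset[8] = {
        0x1c00, 0x1c01, 0x1c02, 0x1c03, 0x3c00, 0x3c01, 0x3c02, 0x3c03,
};

#define EXAMPLE_io_port(reg)    (instance->io_port + example_offset[(reg)])
#define EXAMPLE_read(reg)       inb(EXAMPLE_io_port(reg))
#define EXAMPLE_write(reg, v)   outb((v), EXAMPLE_io_port(reg))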
index 2c1160c7ec92fdd48c078c4862a2dac7904edc31..47b9d13f97b880033c20144968cf715c8af41623 100644 (file)
@@ -227,6 +227,7 @@ static struct {
        {"Promise", "VTrak E610f", NULL, BLIST_SPARSELUN | BLIST_NO_RSOC},
        {"Promise", "", NULL, BLIST_SPARSELUN},
        {"QNAP", "iSCSI Storage", NULL, BLIST_MAX_1024},
+       {"SYNOLOGY", "iSCSI Storage", NULL, BLIST_MAX_1024},
        {"QUANTUM", "XP34301", "1071", BLIST_NOTQ},
        {"REGAL", "CDC-4X", NULL, BLIST_MAX5LUN | BLIST_SINGLELUN},
        {"SanDisk", "ImageMate CF-SD1", NULL, BLIST_FORCELUN},
index 41c115c230d9bf66da94693597967d037fdefaf6..55627d097873a85780a878e32b32c20e6c4a9524 100644 (file)
@@ -390,7 +390,7 @@ module_param(storvsc_ringbuffer_size, int, S_IRUGO);
 MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)");
 
 module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO);
-MODULE_PARM_DESC(vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
+MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
 /*
  * Timeout in seconds for all devices managed by this driver.
  */
index 22a42836d193a3723c4cb1193a0c72aeb9d99ffe..b9de487bbd311d4aeec4e215cf698c959f4b1db6 100644 (file)
 #define NCR5380_queue_command           sun3scsi_queue_command
 #define NCR5380_bus_reset               sun3scsi_bus_reset
 #define NCR5380_abort                   sun3scsi_abort
-#define NCR5380_show_info               sun3scsi_show_info
 #define NCR5380_info                    sun3scsi_info
 
 #define NCR5380_dma_read_setup(instance, data, count) \
-        sun3scsi_dma_setup(data, count, 0)
+        sun3scsi_dma_setup(instance, data, count, 0)
 #define NCR5380_dma_write_setup(instance, data, count) \
-        sun3scsi_dma_setup(data, count, 1)
+        sun3scsi_dma_setup(instance, data, count, 1)
 #define NCR5380_dma_residual(instance) \
         sun3scsi_dma_residual(instance)
 #define NCR5380_dma_xfer_len(instance, cmd, phase) \
@@ -86,10 +85,6 @@ module_param(setup_use_tagged_queuing, int, 0);
 static int setup_hostid = -1;
 module_param(setup_hostid, int, 0);
 
-/* #define RESET_BOOT */
-
-#define        AFTER_RESET_DELAY       (HZ/2)
-
 /* ms to wait after hitting dma regs */
 #define SUN3_DMA_DELAY 10
 
@@ -100,11 +95,10 @@ static struct scsi_cmnd *sun3_dma_setup_done;
 static unsigned char *sun3_scsi_regp;
 static volatile struct sun3_dma_regs *dregs;
 static struct sun3_udc_regs *udc_regs;
-static unsigned char *sun3_dma_orig_addr = NULL;
-static unsigned long sun3_dma_orig_count = 0;
-static int sun3_dma_active = 0;
-static unsigned long last_residual = 0;
-static struct Scsi_Host *default_instance;
+static unsigned char *sun3_dma_orig_addr;
+static unsigned long sun3_dma_orig_count;
+static int sun3_dma_active;
+static unsigned long last_residual;
 
 /*
  * NCR 5380 register access functions
@@ -144,50 +138,12 @@ static inline void sun3_udc_write(unsigned short val, unsigned char reg)
 }
 #endif
 
-#ifdef RESET_BOOT
-static void sun3_scsi_reset_boot(struct Scsi_Host *instance)
-{
-       unsigned long end;
-       
-       /*
-        * Do a SCSI reset to clean up the bus during initialization. No
-        * messing with the queues, interrupts, or locks necessary here.
-        */
-
-       printk( "Sun3 SCSI: resetting the SCSI bus..." );
-
-       /* switch off SCSI IRQ - catch an interrupt without IRQ bit set else */
-//             sun3_disable_irq( IRQ_SUN3_SCSI );
-
-       /* get in phase */
-       NCR5380_write( TARGET_COMMAND_REG,
-                     PHASE_SR_TO_TCR( NCR5380_read(STATUS_REG) ));
-
-       /* assert RST */
-       NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST );
-
-       /* The min. reset hold time is 25us, so 40us should be enough */
-       udelay( 50 );
-
-       /* reset RST and interrupt */
-       NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE );
-       NCR5380_read( RESET_PARITY_INTERRUPT_REG );
-
-       for( end = jiffies + AFTER_RESET_DELAY; time_before(jiffies, end); )
-               barrier();
-
-       /* switch on SCSI IRQ again */
-//             sun3_enable_irq( IRQ_SUN3_SCSI );
-
-       printk( " done\n" );
-}
-#endif
-
 // safe bits for the CSR
 #define CSR_GOOD 0x060f
 
-static irqreturn_t scsi_sun3_intr(int irq, void *dummy)
+static irqreturn_t scsi_sun3_intr(int irq, void *dev)
 {
+       struct Scsi_Host *instance = dev;
        unsigned short csr = dregs->csr;
        int handled = 0;
 
@@ -196,46 +152,24 @@ static irqreturn_t scsi_sun3_intr(int irq, void *dummy)
 #endif
 
        if(csr & ~CSR_GOOD) {
-               if(csr & CSR_DMA_BUSERR) {
-                       printk("scsi%d: bus error in dma\n", default_instance->host_no);
-               }
-
-               if(csr & CSR_DMA_CONFLICT) {
-                       printk("scsi%d: dma conflict\n", default_instance->host_no);
-               }
+               if (csr & CSR_DMA_BUSERR)
+                       shost_printk(KERN_ERR, instance, "bus error in DMA\n");
+               if (csr & CSR_DMA_CONFLICT)
+                       shost_printk(KERN_ERR, instance, "DMA conflict\n");
                handled = 1;
        }
 
        if(csr & (CSR_SDB_INT | CSR_DMA_INT)) {
-               NCR5380_intr(irq, dummy);
+               NCR5380_intr(irq, dev);
                handled = 1;
        }
 
        return IRQ_RETVAL(handled);
 }
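
A minimal sketch of the dev_id pattern adopted above: the cookie passed as the last argument of request_irq() is handed back as the handler's second argument, so per-host state needs no file-scope default_instance. The handler name is illustrative:

static irqreturn_t example_intr(int irq, void *dev)
{
        struct Scsi_Host *instance = dev;       /* the request_irq() cookie */

        shost_printk(KERN_DEBUG, instance, "interrupt on irq %d\n", irq);
        return IRQ_HANDLED;
}

/* registration, e.g. in the probe path:
 *      request_irq(irq, example_intr, 0, "example", instance);
 */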
 
-/*
- * Debug stuff - to be called on NMI, or sysrq key. Use at your own risk; 
- * reentering NCR5380_print_status seems to have ugly side effects
- */
-
-/* this doesn't seem to get used at all -- sam */
-#if 0
-void sun3_sun3_debug (void)
-{
-       unsigned long flags;
-
-       if (default_instance) {
-                       local_irq_save(flags);
-                       NCR5380_print_status(default_instance);
-                       local_irq_restore(flags);
-       }
-}
-#endif
-
-
 /* sun3scsi_dma_setup() -- initialize the dma controller for a read/write */
-static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int write_flag)
+static unsigned long sun3scsi_dma_setup(struct Scsi_Host *instance,
+                                void *data, unsigned long count, int write_flag)
 {
        void *addr;
 
@@ -287,10 +221,9 @@ static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int wri
        dregs->csr |= CSR_FIFO;
        
        if(dregs->fifo_count != count) { 
-               printk("scsi%d: fifo_mismatch %04x not %04x\n",
-                      default_instance->host_no, dregs->fifo_count,
-                      (unsigned int) count);
-               NCR5380_dprint(NDEBUG_DMA, default_instance);
+               shost_printk(KERN_ERR, instance, "FIFO mismatch %04x not %04x\n",
+                            dregs->fifo_count, (unsigned int) count);
+               NCR5380_dprint(NDEBUG_DMA, instance);
        }
 
        /* setup udc */
@@ -325,21 +258,6 @@ static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int wri
 
 }
 
-#ifndef SUN3_SCSI_VME
-static inline unsigned long sun3scsi_dma_count(struct Scsi_Host *instance)
-{
-       unsigned short resid;
-
-       dregs->udc_addr = 0x32; 
-       udelay(SUN3_DMA_DELAY);
-       resid = dregs->udc_data;
-       udelay(SUN3_DMA_DELAY);
-       resid *= 2;
-
-       return (unsigned long) resid;
-}
-#endif
-
 static inline unsigned long sun3scsi_dma_residual(struct Scsi_Host *instance)
 {
        return last_residual;
@@ -437,7 +355,10 @@ static int sun3scsi_dma_finish(int write_flag)
                }
        }
 
-       count = sun3scsi_dma_count(default_instance);
+       dregs->udc_addr = 0x32;
+       udelay(SUN3_DMA_DELAY);
+       count = 2 * dregs->udc_data;
+       udelay(SUN3_DMA_DELAY);
 
        fifo = dregs->fifo_count;
        last_residual = fifo;
@@ -502,17 +423,17 @@ static int sun3scsi_dma_finish(int write_flag)
 static struct scsi_host_template sun3_scsi_template = {
        .module                 = THIS_MODULE,
        .proc_name              = DRV_MODULE_NAME,
-       .show_info              = sun3scsi_show_info,
        .name                   = SUN3_SCSI_NAME,
        .info                   = sun3scsi_info,
        .queuecommand           = sun3scsi_queue_command,
-       .eh_abort_handler       = sun3scsi_abort,
-       .eh_bus_reset_handler   = sun3scsi_bus_reset,
+       .eh_abort_handler       = sun3scsi_abort,
+       .eh_bus_reset_handler   = sun3scsi_bus_reset,
        .can_queue              = 16,
        .this_id                = 7,
        .sg_tablesize           = SG_NONE,
        .cmd_per_lun            = 2,
-       .use_clustering         = DISABLE_CLUSTERING
+       .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
 };
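
A sketch of what the new .cmd_size field buys (struct and helper names illustrative): the SCSI midlayer over-allocates every scsi_cmnd by cmd_size bytes, so the 5380 core can keep its per-command state, sized by NCR5380_CMD_SIZE in this series, directly behind the command instead of in a separate allocation:

struct example_cmd {
        struct list_head list;          /* e.g. queue linkage for the core */
};

static inline struct example_cmd *example_cmd_priv(struct scsi_cmnd *cmd)
{
        return (struct example_cmd *)(cmd + 1); /* the cmd_size area */
}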
 
 static int __init sun3_scsi_probe(struct platform_device *pdev)
@@ -591,7 +512,6 @@ static int __init sun3_scsi_probe(struct platform_device *pdev)
                error = -ENOMEM;
                goto fail_alloc;
        }
-       default_instance = instance;
 
        instance->io_port = (unsigned long)ioaddr;
        instance->irq = irq->start;
@@ -600,7 +520,9 @@ static int __init sun3_scsi_probe(struct platform_device *pdev)
        host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0;
 #endif
 
-       NCR5380_init(instance, host_flags);
+       error = NCR5380_init(instance, host_flags);
+       if (error)
+               goto fail_init;
 
        error = request_irq(instance->irq, scsi_sun3_intr, 0,
                            "NCR5380", instance);
@@ -631,9 +553,7 @@ static int __init sun3_scsi_probe(struct platform_device *pdev)
        dregs->ivect = VME_DATA24 | (instance->irq & 0xff);
 #endif
 
-#ifdef RESET_BOOT
-       sun3_scsi_reset_boot(instance);
-#endif
+       NCR5380_maybe_reset_bus(instance);
 
        error = scsi_add_host(instance, NULL);
        if (error)
@@ -649,6 +569,7 @@ fail_host:
                free_irq(instance->irq, instance);
 fail_irq:
        NCR5380_exit(instance);
+fail_init:
        scsi_host_put(instance);
 fail_alloc:
        if (udc_regs)
index 87828acbf7c63601d618909a59028a3b4ffa6372..4615fda60dbdf6477ff28566740f59f6f51975ea 100644 (file)
  * 15 9-11
  */
  
-#include <linux/signal.h>
 #include <linux/io.h>
 #include <linux/blkdev.h>
 #include <linux/interrupt.h>
-#include <linux/stat.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/delay.h>
 
 #include <scsi/scsi_host.h>
 #include "t128.h"
@@ -126,7 +123,7 @@ static struct signature {
 
 static int __init t128_setup(char *str)
 {
-    static int commandline_current = 0;
+       static int commandline_current;
     int i;
     int ints[10];
 
@@ -165,7 +162,7 @@ __setup("t128=", t128_setup);
 
 static int __init t128_detect(struct scsi_host_template *tpnt)
 {
-    static int current_override = 0, current_base = 0;
+       static int current_override, current_base;
     struct Scsi_Host *instance;
     unsigned long base;
     void __iomem *p;
@@ -182,9 +179,8 @@ static int __init t128_detect(struct scsi_host_template *tpnt)
                base = 0;
        } else 
            for (; !base && (current_base < NO_BASES); ++current_base) {
-#if (TDEBUG & TDEBUG_INIT)
-    printk("scsi-t128 : probing address %08x\n", bases[current_base].address);
-#endif
+               dprintk(NDEBUG_INIT, "t128: probing address 0x%08x\n",
+                       bases[current_base].address);
                if (bases[current_base].noauto)
                        continue;
                p = ioremap(bases[current_base].address, 0x2000);
@@ -195,17 +191,13 @@ static int __init t128_detect(struct scsi_host_template *tpnt)
                                        signatures[sig].string,
                                        strlen(signatures[sig].string))) {
                        base = bases[current_base].address;
-#if (TDEBUG & TDEBUG_INIT)
-                       printk("scsi-t128 : detected board.\n");
-#endif
+                       dprintk(NDEBUG_INIT, "t128: detected board\n");
                        goto found;
                    }
                iounmap(p);
            }
 
-#if defined(TDEBUG) && (TDEBUG & TDEBUG_INIT)
-       printk("scsi-t128 : base = %08x\n", (unsigned int) base);
-#endif
+       dprintk(NDEBUG_INIT, "t128: base = 0x%08x\n", (unsigned int)base);
 
        if (!base)
            break;
@@ -213,12 +205,15 @@ static int __init t128_detect(struct scsi_host_template *tpnt)
 found:
        instance = scsi_register (tpnt, sizeof(struct NCR5380_hostdata));
        if(instance == NULL)
-               break;
-               
+               goto out_unmap;
+
        instance->base = base;
        ((struct NCR5380_hostdata *)instance->hostdata)->base = p;
 
-       NCR5380_init(instance, 0);
+       if (NCR5380_init(instance, 0))
+               goto out_unregister;
+
+       NCR5380_maybe_reset_bus(instance);
 
        if (overrides[current_override].irq != IRQ_AUTO)
            instance->irq = overrides[current_override].irq;
@@ -242,27 +237,30 @@ found:
            printk("scsi%d : please jumper the board for a free IRQ.\n", instance->host_no);
        }
 
-#if defined(TDEBUG) && (TDEBUG & TDEBUG_INIT)
-       printk("scsi%d : irq = %d\n", instance->host_no, instance->irq);
-#endif
+       dprintk(NDEBUG_INIT, "scsi%d: irq = %d\n",
+               instance->host_no, instance->irq);
 
        ++current_override;
        ++count;
     }
     return count;
+
+out_unregister:
+       scsi_unregister(instance);
+out_unmap:
+       iounmap(p);
+       return count;
 }
 
 static int t128_release(struct Scsi_Host *shost)
 {
-       NCR5380_local_declare();
-       NCR5380_setup(shost);
+       struct NCR5380_hostdata *hostdata = shost_priv(shost);
+
        if (shost->irq != NO_IRQ)
                free_irq(shost->irq, shost);
        NCR5380_exit(shost);
-       if (shost->io_port && shost->n_io_port)
-               release_region(shost->io_port, shost->n_io_port);
        scsi_unregister(shost);
-       iounmap(base);
+       iounmap(hostdata->base);
        return 0;
 }
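
The release path above distills to the shost_priv() pattern, sketched here assuming the fields this driver declares via NCR5380_implementation_fields: driver state lives in the hostdata area allocated together with the Scsi_Host, so teardown recovers the ioremap()ed base without globals:

static int example_release(struct Scsi_Host *shost)
{
        struct NCR5380_hostdata *hostdata = shost_priv(shost);

        if (shost->irq != NO_IRQ)
                free_irq(shost->irq, shost);
        NCR5380_exit(shost);
        scsi_unregister(shost);
        iounmap(hostdata->base);        /* mapped during detection */
        return 0;
}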
 
@@ -308,14 +306,14 @@ static int t128_biosparam(struct scsi_device *sdev, struct block_device *bdev,
  *     timeout.
  */
 
-static inline int NCR5380_pread (struct Scsi_Host *instance, unsigned char *dst,
-    int len) {
-    NCR5380_local_declare();
-    void __iomem *reg;
+static inline int
+NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, int len)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       void __iomem *reg, *base = hostdata->base;
     unsigned char *d = dst;
     register int i = len;
 
-    NCR5380_setup(instance);
     reg = base + T_DATA_REG_OFFSET;
 
 #if 0
@@ -354,14 +352,14 @@ static inline int NCR5380_pread (struct Scsi_Host *instance, unsigned char *dst,
  *     timeout.
  */
 
-static inline int NCR5380_pwrite (struct Scsi_Host *instance, unsigned char *src,
-    int len) {
-    NCR5380_local_declare();
-    void __iomem *reg;
+static inline int
+NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, int len)
+{
+       struct NCR5380_hostdata *hostdata = shost_priv(instance);
+       void __iomem *reg, *base = hostdata->base;
     unsigned char *s = src;
     register int i = len;
 
-    NCR5380_setup(instance);
     reg = base + T_DATA_REG_OFFSET;
 
 #if 0
@@ -392,21 +390,23 @@ MODULE_LICENSE("GPL");
 #include "NCR5380.c"
 
 static struct scsi_host_template driver_template = {
-       .name           = "Trantor T128/T128F/T228",
-       .detect         = t128_detect,
-       .release        = t128_release,
-       .proc_name      = "t128",
-       .show_info      = t128_show_info,
-       .write_info     = t128_write_info,
-       .info           = t128_info,
-       .queuecommand   = t128_queue_command,
-       .eh_abort_handler = t128_abort,
-       .eh_bus_reset_handler    = t128_bus_reset,
-       .bios_param     = t128_biosparam,
-       .can_queue      = CAN_QUEUE,
-        .this_id        = 7,
-       .sg_tablesize   = SG_ALL,
-       .cmd_per_lun    = CMD_PER_LUN,
-       .use_clustering = DISABLE_CLUSTERING,
+       .name                   = "Trantor T128/T128F/T228",
+       .detect                 = t128_detect,
+       .release                = t128_release,
+       .proc_name              = "t128",
+       .show_info              = t128_show_info,
+       .write_info             = t128_write_info,
+       .info                   = t128_info,
+       .queuecommand           = t128_queue_command,
+       .eh_abort_handler       = t128_abort,
+       .eh_bus_reset_handler   = t128_bus_reset,
+       .bios_param             = t128_biosparam,
+       .can_queue              = 32,
+       .this_id                = 7,
+       .sg_tablesize           = SG_ALL,
+       .cmd_per_lun            = 2,
+       .use_clustering         = DISABLE_CLUSTERING,
+       .cmd_size               = NCR5380_CMD_SIZE,
+       .max_sectors            = 128,
 };
 #include "scsi_module.c"
index 2c7371454dfd40cf5d398f64c9072d5f11f9e03d..dd16d85497e168680a5d3467f24baf0cbfe1e05a 100644 (file)
 #ifndef T128_H
 #define T128_H
 
-#define TDEBUG         0
-#define TDEBUG_INIT    0x1
-#define TDEBUG_TRANSFER 0x2
-
 /*
  * The trantor boards are memory mapped. They use an NCR5380 or
  * equivalent (my sample board had part second sourced from ZILOG).
 
 #define T_DATA_REG_OFFSET      0x1e00  /* rw 512 bytes long */
 
-#ifndef ASM
-
-#ifndef CMD_PER_LUN
-#define CMD_PER_LUN 2
-#endif
-
-#ifndef CAN_QUEUE
-#define CAN_QUEUE 32
-#endif
-
 #define NCR5380_implementation_fields \
     void __iomem *base
 
-#define NCR5380_local_declare() \
-    void __iomem *base
-
-#define NCR5380_setup(instance) \
-    base = ((struct NCR5380_hostdata *)(instance->hostdata))->base
+#define T128_address(reg) \
+       (((struct NCR5380_hostdata *)shost_priv(instance))->base + T_5380_OFFSET + ((reg) * 0x20))
 
-#define T128_address(reg) (base + T_5380_OFFSET + ((reg) * 0x20))
-
-#if !(TDEBUG & TDEBUG_TRANSFER)
 #define NCR5380_read(reg) readb(T128_address(reg))
 #define NCR5380_write(reg, value) writeb((value),(T128_address(reg)))
-#else
-#define NCR5380_read(reg)                                              \
-    (((unsigned char) printk("scsi%d : read register %d at address %08x\n"\
-    , instance->hostno, (reg), T128_address(reg))), readb(T128_address(reg)))
-
-#define NCR5380_write(reg, value) {                                    \
-    printk("scsi%d : write %02x to register %d at address %08x\n",     \
-           instance->hostno, (value), (reg), T128_address(reg));       \
-    writeb((value), (T128_address(reg)));                              \
-}
-#endif
+
+#define NCR5380_dma_xfer_len(instance, cmd, phase)     (cmd->transfersize)
 
 #define NCR5380_intr t128_intr
-#define do_NCR5380_intr do_t128_intr
 #define NCR5380_queue_command t128_queue_command
 #define NCR5380_abort t128_abort
 #define NCR5380_bus_reset t128_bus_reset
 
 #define T128_IRQS 0xc4a8
 
-#endif /* ndef ASM */
 #endif /* T128_H */
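
And the memory-mapped counterpart of the pas16 port-I/O accessors, sketched with illustrative constants: T128 registers sit every 0x20 bytes past a fixed offset from the ioremap()ed base kept in hostdata, so the macros reduce to readb()/writeb():

#define EX_5380_OFFSET  0x1d00

#define EX_address(reg) \
        (((struct NCR5380_hostdata *)shost_priv(instance))->base + \
         EX_5380_OFFSET + ((reg) * 0x20))

#define EX_read(reg)            readb(EX_address(reg))
#define EX_write(reg, value)    writeb((value), EX_address(reg))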
index fb2b3935b658f5bae0286b8153bafe7753ef6c8c..88260205a2614c84e8293bf82ca47c49abb2e29d 100644 (file)
@@ -7,6 +7,7 @@ source "drivers/soc/mediatek/Kconfig"
 source "drivers/soc/qcom/Kconfig"
 source "drivers/soc/rockchip/Kconfig"
 source "drivers/soc/sunxi/Kconfig"
+source "drivers/soc/tegra/Kconfig"
 source "drivers/soc/ti/Kconfig"
 source "drivers/soc/versatile/Kconfig"
 
index 0ad66fa9bb1aa47c97a7a5c1cae80f7bbc945473..5548a31e1a39a100142b45841cbe38e1aa007e38 100644 (file)
@@ -288,7 +288,7 @@ static struct spm_driver_data *spm_get_drv(struct platform_device *pdev,
        struct spm_driver_data *drv = NULL;
        struct device_node *cpu_node, *saw_node;
        int cpu;
-       bool found;
+       bool found = false;
 
        for_each_possible_cpu(cpu) {
                cpu_node = of_cpu_device_node_get(cpu);
diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig
new file mode 100644 (file)
index 0000000..d0c3c3e
--- /dev/null
@@ -0,0 +1,83 @@
+if ARCH_TEGRA
+
+# 32-bit ARM SoCs
+if ARM
+
+config ARCH_TEGRA_2x_SOC
+       bool "Enable support for Tegra20 family"
+       select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
+       select ARM_ERRATA_720789
+       select ARM_ERRATA_754327 if SMP
+       select ARM_ERRATA_764369 if SMP
+       select PINCTRL_TEGRA20
+       select PL310_ERRATA_727915 if CACHE_L2X0
+       select PL310_ERRATA_769419 if CACHE_L2X0
+       select TEGRA_TIMER
+       help
+         Support for NVIDIA Tegra AP20 and T20 processors, based on the
+         ARM Cortex-A9 MP CPU and the ARM PL310 L2 cache controller.
+
+config ARCH_TEGRA_3x_SOC
+       bool "Enable support for Tegra30 family"
+       select ARM_ERRATA_754322
+       select ARM_ERRATA_764369 if SMP
+       select PINCTRL_TEGRA30
+       select PL310_ERRATA_769419 if CACHE_L2X0
+       select TEGRA_TIMER
+       help
+         Support for NVIDIA Tegra T30 processor family, based on the
+         ARM Cortex-A9 MP CPU and the ARM PL310 L2 cache controller.
+
+config ARCH_TEGRA_114_SOC
+       bool "Enable support for Tegra114 family"
+       select ARM_ERRATA_798181 if SMP
+       select ARM_L1_CACHE_SHIFT_6
+       select HAVE_ARM_ARCH_TIMER
+       select PINCTRL_TEGRA114
+       select TEGRA_TIMER
+       help
+         Support for NVIDIA Tegra T114 processor family, based on the
+         ARM Cortex-A15 MP CPU.
+
+config ARCH_TEGRA_124_SOC
+       bool "Enable support for Tegra124 family"
+       select ARM_L1_CACHE_SHIFT_6
+       select HAVE_ARM_ARCH_TIMER
+       select PINCTRL_TEGRA124
+       select TEGRA_TIMER
+       help
+         Support for NVIDIA Tegra T124 processor family, based on the
+         ARM Cortex-A15 MP CPU.
+
+endif
+
+# 64-bit ARM SoCs
+if ARM64
+
+config ARCH_TEGRA_132_SOC
+       bool "NVIDIA Tegra132 SoC"
+       select PINCTRL_TEGRA124
+       help
+         Enable support for NVIDIA Tegra132 SoC, based on the Denver
+         ARMv8 CPU.  The Tegra132 SoC is similar to the Tegra124 SoC,
+         but contains an NVIDIA Denver CPU complex in place of
+         Tegra124's "4+1" Cortex-A15 CPU complex.
+
+config ARCH_TEGRA_210_SOC
+       bool "NVIDIA Tegra210 SoC"
+       select PINCTRL_TEGRA210
+       help
+         Enable support for the NVIDIA Tegra210 SoC. Also known as Tegra X1,
+         the Tegra210 has four Cortex-A57 cores paired with four Cortex-A53
+         cores in a switched configuration. It features a Maxwell-architecture
+         GPU with 256 CUDA cores and support for DX11, SM4, OpenGL 4.5 and
+         OpenGL ES 3.1. It supports hardware-accelerated encoding and decoding
+         of various video standards, including H.265, H.264 and VP8, at 4K
+         resolution and up to 60 fps.
+
+         Besides the multimedia features it also comes with a variety of I/O
+         controllers, such as GPIO, I2C, SPI, SDHCI, PCIe, SATA and XHCI, to
+         name only a few.
+
+endif
+endif
index d6273e143324e196b4ef2dbd9af967f3516235fa..a80d993b882eb1b3a76cab8ac6ce5ec0814a2e9f 100644 (file)
@@ -151,16 +151,12 @@ do {                                                                          \
 
 #define LIBCFS_FREE(ptr, size)                                   \
 do {                                                               \
-       int s = (size);                                          \
        if (unlikely((ptr) == NULL)) {                            \
                CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at "    \
-                      "%s:%d\n", s, __FILE__, __LINE__);              \
+                      "%s:%d\n", (int)(size), __FILE__, __LINE__);     \
                break;                                            \
        }                                                              \
-       if (unlikely(s > LIBCFS_VMALLOC_SIZE))                    \
-               vfree(ptr);                                 \
-       else                                                        \
-               kfree(ptr);                                       \
+       kvfree(ptr);                                      \
 } while (0)
 
 /******************************************************************************/
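
The LIBCFS_FREE simplification relies on kvfree() picking the release path itself; roughly what the mm/util.c helper does, sketched here (not the verbatim implementation):

void example_kvfree(const void *addr)
{
        if (is_vmalloc_addr(addr))
                vfree(addr);
        else
                kfree(addr);
}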
index 72af486b65df70ee2b011ee745bef74c33beec5c..cb74ae731b955d623b9d7cce3c3cb3441229365e 100644 (file)
@@ -2070,32 +2070,13 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
 
 static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
 {
-       struct ib_device_attr *attr;
-       int rc;
-
        /* It's safe to assume a HCA can handle a page size
         * matching that of the native system */
        hdev->ibh_page_shift = PAGE_SHIFT;
        hdev->ibh_page_size  = 1 << PAGE_SHIFT;
        hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);
 
-       LIBCFS_ALLOC(attr, sizeof(*attr));
-       if (attr == NULL) {
-               CERROR("Out of memory\n");
-               return -ENOMEM;
-       }
-
-       rc = ib_query_device(hdev->ibh_ibdev, attr);
-       if (rc == 0)
-               hdev->ibh_mr_size = attr->max_mr_size;
-
-       LIBCFS_FREE(attr, sizeof(*attr));
-
-       if (rc != 0) {
-               CERROR("Failed to query IB device: %d\n", rc);
-               return rc;
-       }
-
+       hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
        if (hdev->ibh_mr_size == ~0ULL) {
                hdev->ibh_mr_shift = 64;
                return 0;
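
A one-liner sketch of the attribute-cache usage above: the IB core now fills ib_device->attrs once at registration, so a per-call ib_query_device() plus a temporary ib_device_attr is unnecessary:

static u64 example_max_mr_size(struct ib_device *ibdev)
{
        return ibdev->attrs.max_mr_size;        /* cached by the IB core */
}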
index 7b355319079c3f2b4ca41f944929411747b30b5c..8982f7d1b374823eb8dec18f98b0b1ef167e8f26 100644 (file)
@@ -1858,7 +1858,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
        int api32 = ll_need_32bit_api(sbi);
        loff_t ret = -EINVAL;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        switch (origin) {
        case SEEK_SET:
                break;
@@ -1896,7 +1896,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
        goto out;
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
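
The lustre conversions in this hunk and the ones that follow lean on the new fs.h wrappers, which at this point are thin shims over the old i_mutex; paraphrased (helper names prefixed to mark them as a sketch):

static inline void example_inode_lock(struct inode *inode)
{
        mutex_lock(&inode->i_mutex);
}

static inline void example_inode_unlock(struct inode *inode)
{
        mutex_unlock(&inode->i_mutex);
}

static inline int example_inode_trylock(struct inode *inode)
{
        return mutex_trylock(&inode->i_mutex);
}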
 
index c92d58b770ec2bbd3b56c0600a0c2a6aeb8b015e..39e2ffd5f97f1addb2d0b0067285935b30f07185 100644 (file)
@@ -2082,17 +2082,17 @@ putgl:
        /* update time if requested */
        rc = 0;
        if (llss->ia2.ia_valid != 0) {
-               mutex_lock(&llss->inode1->i_mutex);
+               inode_lock(llss->inode1);
                rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
-               mutex_unlock(&llss->inode1->i_mutex);
+               inode_unlock(llss->inode1);
        }
 
        if (llss->ia1.ia_valid != 0) {
                int rc1;
 
-               mutex_lock(&llss->inode2->i_mutex);
+               inode_lock(llss->inode2);
                rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
-               mutex_unlock(&llss->inode2->i_mutex);
+               inode_unlock(llss->inode2);
                if (rc == 0)
                        rc = rc1;
        }
@@ -2179,13 +2179,13 @@ static int ll_hsm_import(struct inode *inode, struct file *file,
                         ATTR_MTIME | ATTR_MTIME_SET |
                         ATTR_ATIME | ATTR_ATIME_SET;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        rc = ll_setattr_raw(file->f_path.dentry, attr, true);
        if (rc == -ENODATA)
                rc = 0;
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        kfree(attr);
 free_hss:
@@ -2609,7 +2609,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
 
        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* catch async errors that were recorded back when async writeback
         * failed for pages in this mapping. */
@@ -2641,7 +2641,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                        fd->fd_write_failed = false;
        }
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return rc;
 }
 
index ee8a1d67d19119ee034ed5ffa7d64829042aa20d..845e992ca5fcb9898abb4887f70c9165818d4079 100644 (file)
@@ -631,8 +631,6 @@ struct ll_file_data {
 
 struct lov_stripe_md;
 
-extern spinlock_t inode_lock;
-
 extern struct dentry *llite_root;
 extern struct kset *llite_kset;
 
index 1db93af62badd52ecf551ce6b5b3f2d050efd678..b2fc5b3786ee62890956c25f6972ee329489346f 100644 (file)
@@ -1277,7 +1277,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
                return -ENOMEM;
 
        if (!S_ISDIR(inode->i_mode))
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
        memcpy(&op_data->op_attr, attr, sizeof(*attr));
 
@@ -1358,7 +1358,7 @@ out:
        ll_finish_md_op_data(op_data);
 
        if (!S_ISDIR(inode->i_mode)) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                if ((attr->ia_valid & ATTR_SIZE) && !hsm_import)
                        inode_dio_wait(inode);
        }
index e578a1130ad1bfd007ef8c498b06fe4917d07efe..18aab25f9cd9440680c216fc14b340cfc00e5cec 100644 (file)
@@ -245,9 +245,9 @@ static int ll_get_name(struct dentry *dentry, char *name,
                goto out;
        }
 
-       mutex_lock(&dir->i_mutex);
+       inode_lock(dir);
        rc = ll_dir_read(dir, &lgd.ctx);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        if (!rc && !lgd.lgd_found)
                rc = -ENOENT;
 out:
index 420d39123877b61c92c24d40dbe4ac37e5e0fc82..871924b3f2e72f44a8360cb1e6638d1c13b0eee3 100644 (file)
@@ -257,9 +257,9 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
         *    be asked to write less pages once, this purely depends on
         *    implementation. Anyway, we should be careful to avoid deadlocking.
         */
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        cl_io_fini(env, io);
        return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
 }
index 95cdb0c58b04dffaa784cd04ef07f6f36fad004e..f355474967d62617319859e8df2830795a0a5458 100644 (file)
@@ -115,8 +115,8 @@ static struct ll_cl_context *ll_cl_init(struct file *file,
                struct inode *inode = vmpage->mapping->host;
                loff_t pos;
 
-               if (mutex_trylock(&inode->i_mutex)) {
-                       mutex_unlock(&(inode)->i_mutex);
+               if (inode_trylock(inode)) {
+                       inode_unlock(inode);
 
                        /* this is too bad. Someone is trying to write the
                         * page w/o holding inode mutex. This means we can
index 39fa13b74cbdac3f82f128aee9837063d253183a..711fda93a58df7af9a6dbfe7e0403c0ef9d0700f 100644 (file)
@@ -403,7 +403,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
         * 1. Need inode mutex to operate transient pages.
         */
        if (iov_iter_rw(iter) == READ)
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
 
        LASSERT(obj->cob_transient_pages == 0);
        while (iov_iter_count(iter)) {
@@ -454,7 +454,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
 out:
        LASSERT(obj->cob_transient_pages == 0);
        if (iov_iter_rw(iter) == READ)
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
        if (tot_bytes > 0) {
                if (iov_iter_rw(iter) == WRITE) {
index f68e972886caeb510b53f7704dc7ca85a8f1df90..0920ac6b3003afcf0a973a8a25f91c87d3900513 100644 (file)
@@ -439,7 +439,7 @@ static int vvp_io_setattr_start(const struct lu_env *env,
        struct inode    *inode = ccc_object_inode(io->ci_obj);
        int result = 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        if (cl_io_is_trunc(io))
                result = vvp_io_setattr_trunc(env, ios, inode,
                                        io->u.ci_setattr.sa_attr.lvb_size);
@@ -459,7 +459,7 @@ static void vvp_io_setattr_end(const struct lu_env *env,
                 * because osc has already notified to destroy osc_extents. */
                vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 }
 
 static void vvp_io_setattr_fini(const struct lu_env *env,
index 99c0d7aee921d884d7ebc81e2d2658b978c88930..a133475a7c74af7a93fd0ffe2fac4294db284f0c 100644 (file)
@@ -428,7 +428,7 @@ static void vvp_transient_page_verify(const struct cl_page *page)
 {
        struct inode *inode = ccc_object_inode(page->cp_obj);
 
-       LASSERT(!mutex_trylock(&inode->i_mutex));
+       LASSERT(!inode_trylock(inode));
 }
 
 static int vvp_transient_page_own(const struct lu_env *env,
@@ -480,9 +480,9 @@ static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
        struct inode    *inode = ccc_object_inode(slice->cpl_obj);
        int     locked;
 
-       locked = !mutex_trylock(&inode->i_mutex);
+       locked = !inode_trylock(inode);
        if (!locked)
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        return locked ? -EBUSY : -ENODATA;
 }
 
@@ -502,7 +502,7 @@ static void vvp_transient_page_fini(const struct lu_env *env,
        struct ccc_object *clobj = cl2ccc(clp->cp_obj);
 
        vvp_page_fini_common(cp);
-       LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+       LASSERT(!inode_trylock(clobj->cob_inode));
        clobj->cob_transient_pages--;
 }
 
@@ -548,7 +548,7 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
        } else {
                struct ccc_object *clobj = cl2ccc(obj);
 
-               LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+               LASSERT(!inode_trylock(clobj->cob_inode));
                cl_page_slice_add(page, &cpg->cpg_cl, obj,
                                &vvp_transient_page_ops);
                clobj->cob_transient_pages++;
index 3ef881f2da0fb1557a8e3642d72604196dc97f6d..7ad0c082485a3c94bbccbf7347747c8537e0f906 100644 (file)
@@ -173,9 +173,6 @@ static inline int c2_poll_one(struct c2_dev *c2dev,
        case C2_WR_TYPE_RDMA_READ:
                entry->opcode = IB_WC_RDMA_READ;
                break;
-       case C2_WR_TYPE_BIND_MW:
-               entry->opcode = IB_WC_BIND_MW;
-               break;
        case C2_WR_TYPE_RECV:
                entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
                entry->opcode = IB_WC_RECV;
index a092ac743c723e670947e00c9bff7710aca45f4e..de8d10e1bde3169576e90b2ee800d2d98022ac29 100644 (file)
@@ -337,43 +337,21 @@ static inline u32 c2_convert_access(int acc)
            C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
 }
 
-static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
-                                   struct ib_phys_buf *buffer_list,
-                                   int num_phys_buf, int acc, u64 * iova_start)
+static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
 {
        struct c2_mr *mr;
        u64 *page_list;
-       u32 total_len;
-       int err, i, j, k, page_shift, pbl_depth;
+       const u32 total_len = 0xffffffff;       /* AMSO1100 limit */
+       int err, page_shift, pbl_depth, i;
+       u64 kva = 0;
 
-       pbl_depth = 0;
-       total_len = 0;
+       pr_debug("%s:%u\n", __func__, __LINE__);
 
-       page_shift = PAGE_SHIFT;
        /*
-        * If there is only 1 buffer we assume this could
-        * be a map of all phy mem...use a 32k page_shift.
+        * This is a map of all phy mem...use a 32k page_shift.
         */
-       if (num_phys_buf == 1)
-               page_shift += 3;
-
-       for (i = 0; i < num_phys_buf; i++) {
-
-               if (offset_in_page(buffer_list[i].addr)) {
-                       pr_debug("Unaligned Memory Buffer: 0x%x\n",
-                               (unsigned int) buffer_list[i].addr);
-                       return ERR_PTR(-EINVAL);
-               }
-
-               if (!buffer_list[i].size) {
-                       pr_debug("Invalid Buffer Size\n");
-                       return ERR_PTR(-EINVAL);
-               }
-
-               total_len += buffer_list[i].size;
-               pbl_depth += ALIGN(buffer_list[i].size,
-                                  BIT(page_shift)) >> page_shift;
-       }
+       page_shift = PAGE_SHIFT + 3;
+       pbl_depth = ALIGN(total_len, BIT(page_shift)) >> page_shift;
 
        page_list = vmalloc(sizeof(u64) * pbl_depth);
        if (!page_list) {
@@ -382,16 +360,8 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
                return ERR_PTR(-ENOMEM);
        }
 
-       for (i = 0, j = 0; i < num_phys_buf; i++) {
-
-               int naddrs;
-
-               naddrs = ALIGN(buffer_list[i].size,
-                              BIT(page_shift)) >> page_shift;
-               for (k = 0; k < naddrs; k++)
-                       page_list[j++] = (buffer_list[i].addr +
-                                                    (k << page_shift));
-       }
+       for (i = 0; i < pbl_depth; i++)
+               page_list[i] = (i << page_shift);
 
        mr = kmalloc(sizeof(*mr), GFP_KERNEL);
        if (!mr) {
@@ -399,17 +369,17 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
                return ERR_PTR(-ENOMEM);
        }
 
-       mr->pd = to_c2pd(ib_pd);
+       mr->pd = to_c2pd(pd);
        mr->umem = NULL;
        pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
                "*iova_start %llx, first pa %llx, last pa %llx\n",
                __func__, page_shift, pbl_depth, total_len,
-               (unsigned long long) *iova_start,
+               (unsigned long long) kva,
                (unsigned long long) page_list[0],
                (unsigned long long) page_list[pbl_depth-1]);
-       err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
+       err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), page_list,
                                         BIT(page_shift), pbl_depth,
-                                        total_len, 0, iova_start,
+                                        total_len, 0, &kva,
                                         c2_convert_access(acc), mr);
        vfree(page_list);
        if (err) {
@@ -420,19 +390,6 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
        return &mr->ibmr;
 }
 
-static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
-{
-       struct ib_phys_buf bl;
-       u64 kva = 0;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       /* AMSO1100 limit */
-       bl.size = 0xffffffff;
-       bl.addr = 0;
-       return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
-}
-
 static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                    u64 virt, int acc, struct ib_udata *udata)
 {
@@ -840,7 +797,6 @@ int c2_register_device(struct c2_dev *dev)
        dev->ibdev.destroy_cq = c2_destroy_cq;
        dev->ibdev.poll_cq = c2_poll_cq;
        dev->ibdev.get_dma_mr = c2_get_dma_mr;
-       dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
        dev->ibdev.reg_user_mr = c2_reg_user_mr;
        dev->ibdev.dereg_mr = c2_dereg_mr;
        dev->ibdev.get_port_immutable = c2_port_immutable;
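
A sketch of the identity page list the folded-in c2_get_dma_mr() builds, with error handling trimmed and names illustrative: entry i maps the i-th 32 KiB page, so the MR spans [0, total_len) of the address space; the u64 cast keeps the shift from overflowing int for large i:

static u64 *example_identity_pbl(u32 total_len, int page_shift, int *depth)
{
        int pbl_depth = ALIGN(total_len, BIT(page_shift)) >> page_shift;
        u64 *page_list = vmalloc(sizeof(u64) * pbl_depth);
        int i;

        if (!page_list)
                return NULL;

        for (i = 0; i < pbl_depth; i++)
                page_list[i] = (u64)i << page_shift;

        *depth = pbl_depth;
        return page_list;
}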
index bd45e0f3923fa6d9ce4b06202da54852ba78916f..e8c3387d7aaa04e2275d0dce0349ed5211fed9d0 100644 (file)
@@ -316,9 +316,8 @@ struct ehca_mr_pginfo {
 
        union {
                struct { /* type EHCA_MR_PGI_PHYS section */
-                       int num_phys_buf;
-                       struct ib_phys_buf *phys_buf_array;
-                       u64 next_buf;
+                       u64 addr;
+                       u16 size;
                } phy;
                struct { /* type EHCA_MR_PGI_USER section */
                        struct ib_umem *region;
index 80e6a3d5df3e035ea1afdc3ec6bc013050ec9e3e..cca5933fcda6ea35256711ccdb5338bf2a173a5c 100644 (file)
@@ -80,30 +80,14 @@ int ehca_destroy_ah(struct ib_ah *ah);
 
 struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
 
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *phys_buf_array,
-                              int num_phys_buf,
-                              int mr_access_flags, u64 *iova_start);
-
 struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                               u64 virt, int mr_access_flags,
                               struct ib_udata *udata);
 
-int ehca_rereg_phys_mr(struct ib_mr *mr,
-                      int mr_rereg_mask,
-                      struct ib_pd *pd,
-                      struct ib_phys_buf *phys_buf_array,
-                      int num_phys_buf, int mr_access_flags, u64 *iova_start);
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
-
 int ehca_dereg_mr(struct ib_mr *mr);
 
 struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
 
-int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
-                struct ib_mw_bind *mw_bind);
-
 int ehca_dealloc_mw(struct ib_mw *mw);
 
 struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
index 860b974e9faab6107aaecf748433fb415684024f..832f22f408629263b2cd6487fcb488bb35e5cae5 100644 (file)
@@ -511,13 +511,9 @@ static int ehca_init_device(struct ehca_shca *shca)
        shca->ib_device.req_notify_cq       = ehca_req_notify_cq;
        /* shca->ib_device.req_ncomp_notif  = ehca_req_ncomp_notif; */
        shca->ib_device.get_dma_mr          = ehca_get_dma_mr;
-       shca->ib_device.reg_phys_mr         = ehca_reg_phys_mr;
        shca->ib_device.reg_user_mr         = ehca_reg_user_mr;
-       shca->ib_device.query_mr            = ehca_query_mr;
        shca->ib_device.dereg_mr            = ehca_dereg_mr;
-       shca->ib_device.rereg_phys_mr       = ehca_rereg_phys_mr;
        shca->ib_device.alloc_mw            = ehca_alloc_mw;
-       shca->ib_device.bind_mw             = ehca_bind_mw;
        shca->ib_device.dealloc_mw          = ehca_dealloc_mw;
        shca->ib_device.alloc_fmr           = ehca_alloc_fmr;
        shca->ib_device.map_phys_fmr        = ehca_map_phys_fmr;
index 553e883a5718ee0102f4923c5283e7d5ddded6a6..3367205e3160205a566f9672c3700540f03e35ad 100644 (file)
@@ -196,120 +196,6 @@ get_dma_mr_exit0:
 
 /*----------------------------------------------------------------------*/
 
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *phys_buf_array,
-                              int num_phys_buf,
-                              int mr_access_flags,
-                              u64 *iova_start)
-{
-       struct ib_mr *ib_mr;
-       int ret;
-       struct ehca_mr *e_mr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-
-       u64 size;
-
-       if ((num_phys_buf <= 0) || !phys_buf_array) {
-               ehca_err(pd->device, "bad input values: num_phys_buf=%x "
-                        "phys_buf_array=%p", num_phys_buf, phys_buf_array);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-
-       /* check physical buffer list and calculate size */
-       ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
-                                           iova_start, &size);
-       if (ret) {
-               ib_mr = ERR_PTR(ret);
-               goto reg_phys_mr_exit0;
-       }
-       if ((size == 0) ||
-           (((u64)iova_start + size) < (u64)iova_start)) {
-               ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
-                        size, iova_start);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(pd->device, "out of memory");
-               ib_mr = ERR_PTR(-ENOMEM);
-               goto reg_phys_mr_exit0;
-       }
-
-       /* register MR on HCA */
-       if (ehca_mr_is_maxmr(size, iova_start)) {
-               e_mr->flags |= EHCA_MR_FLAG_MAXMR;
-               ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
-                                    e_pd, &e_mr->ib.ib_mr.lkey,
-                                    &e_mr->ib.ib_mr.rkey);
-               if (ret) {
-                       ib_mr = ERR_PTR(ret);
-                       goto reg_phys_mr_exit1;
-               }
-       } else {
-               struct ehca_mr_pginfo pginfo;
-               u32 num_kpages;
-               u32 num_hwpages;
-               u64 hw_pgsize;
-
-               num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
-                                       PAGE_SIZE);
-               /* for kernel space we try most possible pgsize */
-               hw_pgsize = ehca_get_max_hwpage_size(shca);
-               num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
-                                        hw_pgsize);
-               memset(&pginfo, 0, sizeof(pginfo));
-               pginfo.type = EHCA_MR_PGI_PHYS;
-               pginfo.num_kpages = num_kpages;
-               pginfo.hwpage_size = hw_pgsize;
-               pginfo.num_hwpages = num_hwpages;
-               pginfo.u.phy.num_phys_buf = num_phys_buf;
-               pginfo.u.phy.phys_buf_array = phys_buf_array;
-               pginfo.next_hwpage =
-                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
-
-               ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
-                                 e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-                                 &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
-               if (ret) {
-                       ib_mr = ERR_PTR(ret);
-                       goto reg_phys_mr_exit1;
-               }
-       }
-
-       /* successful registration of all pages */
-       return &e_mr->ib.ib_mr;
-
-reg_phys_mr_exit1:
-       ehca_mr_delete(e_mr);
-reg_phys_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
-                        "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
-                        PTR_ERR(ib_mr), pd, phys_buf_array,
-                        num_phys_buf, mr_access_flags, iova_start);
-       return ib_mr;
-} /* end ehca_reg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
 struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                               u64 virt, int mr_access_flags,
                               struct ib_udata *udata)
@@ -437,207 +323,6 @@ reg_user_mr_exit0:
 
 /*----------------------------------------------------------------------*/
 
-int ehca_rereg_phys_mr(struct ib_mr *mr,
-                      int mr_rereg_mask,
-                      struct ib_pd *pd,
-                      struct ib_phys_buf *phys_buf_array,
-                      int num_phys_buf,
-                      int mr_access_flags,
-                      u64 *iova_start)
-{
-       int ret;
-
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-       u64 new_size;
-       u64 *new_start;
-       u32 new_acl;
-       struct ehca_pd *new_pd;
-       u32 tmp_lkey, tmp_rkey;
-       unsigned long sl_flags;
-       u32 num_kpages = 0;
-       u32 num_hwpages = 0;
-       struct ehca_mr_pginfo pginfo;
-
-       if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
-               /* TODO not supported, because PHYP rereg hCall needs pages */
-               ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
-                        "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       if (mr_rereg_mask & IB_MR_REREG_PD) {
-               if (!pd) {
-                       ehca_err(mr->device, "rereg with bad pd, pd=%p "
-                                "mr_rereg_mask=%x", pd, mr_rereg_mask);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-       }
-
-       if ((mr_rereg_mask &
-            ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
-           (mr_rereg_mask == 0)) {
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       /* check other parameters */
-       if (e_mr == shca->maxmr) {
-               /* should be impossible, however reject to be sure */
-               ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
-                        "shca->maxmr=%p mr->lkey=%x",
-                        mr, shca->maxmr, mr->lkey);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
-               if (e_mr->flags & EHCA_MR_FLAG_FMR) {
-                       ehca_err(mr->device, "not supported for FMR, mr=%p "
-                                "flags=%x", mr, e_mr->flags);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-               if (!phys_buf_array || num_phys_buf <= 0) {
-                       ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
-                                " phys_buf_array=%p num_phys_buf=%x",
-                                mr_rereg_mask, phys_buf_array, num_phys_buf);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-       }
-       if ((mr_rereg_mask & IB_MR_REREG_ACCESS) &&     /* change ACL */
-           (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-            ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
-                        "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       /* set requested values dependent on rereg request */
-       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
-       new_start = e_mr->start;
-       new_size = e_mr->size;
-       new_acl = e_mr->acl;
-       new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
-
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
-               u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
-
-               new_start = iova_start; /* change address */
-               /* check physical buffer list and calculate size */
-               ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
-                                                   num_phys_buf, iova_start,
-                                                   &new_size);
-               if (ret)
-                       goto rereg_phys_mr_exit1;
-               if ((new_size == 0) ||
-                   (((u64)iova_start + new_size) < (u64)iova_start)) {
-                       ehca_err(mr->device, "bad input values: new_size=%llx "
-                                "iova_start=%p", new_size, iova_start);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit1;
-               }
-               num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
-                                       new_size, PAGE_SIZE);
-               num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
-                                        new_size, hw_pgsize);
-               memset(&pginfo, 0, sizeof(pginfo));
-               pginfo.type = EHCA_MR_PGI_PHYS;
-               pginfo.num_kpages = num_kpages;
-               pginfo.hwpage_size = hw_pgsize;
-               pginfo.num_hwpages = num_hwpages;
-               pginfo.u.phy.num_phys_buf = num_phys_buf;
-               pginfo.u.phy.phys_buf_array = phys_buf_array;
-               pginfo.next_hwpage =
-                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
-               new_acl = mr_access_flags;
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               new_pd = container_of(pd, struct ehca_pd, ib_pd);
-
-       ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
-                           new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
-       if (ret)
-               goto rereg_phys_mr_exit1;
-
-       /* successful reregistration */
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               mr->pd = pd;
-       mr->lkey = tmp_lkey;
-       mr->rkey = tmp_rkey;
-
-rereg_phys_mr_exit1:
-       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-rereg_phys_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
-                        "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
-                        "iova_start=%p",
-                        ret, mr, mr_rereg_mask, pd, phys_buf_array,
-                        num_phys_buf, mr_access_flags, iova_start);
-       return ret;
-} /* end ehca_rereg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-       unsigned long sl_flags;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
-                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
-               ret = -EINVAL;
-               goto query_mr_exit0;
-       }
-
-       memset(mr_attr, 0, sizeof(struct ib_mr_attr));
-       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
-
-       h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
-                        "hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, mr, shca->ipz_hca_handle.handle,
-                        e_mr->ipz_mr_handle.handle, mr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto query_mr_exit1;
-       }
-       mr_attr->pd = mr->pd;
-       mr_attr->device_virt_addr = hipzout.vaddr;
-       mr_attr->size = hipzout.len;
-       mr_attr->lkey = hipzout.lkey;
-       mr_attr->rkey = hipzout.rkey;
-       ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
-
-query_mr_exit1:
-       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-query_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
-                        ret, mr, mr_attr);
-       return ret;
-} /* end ehca_query_mr() */
-
-/*----------------------------------------------------------------------*/
-
 int ehca_dereg_mr(struct ib_mr *mr)
 {
        int ret = 0;
@@ -728,18 +413,6 @@ alloc_mw_exit0:
 
 /*----------------------------------------------------------------------*/
 
-int ehca_bind_mw(struct ib_qp *qp,
-                struct ib_mw *mw,
-                struct ib_mw_bind *mw_bind)
-{
-       /* TODO: not supported up to now */
-       ehca_gen_err("bind MW currently not supported by HCAD");
-
-       return -EPERM;
-} /* end ehca_bind_mw() */
-
-/*----------------------------------------------------------------------*/
-
 int ehca_dealloc_mw(struct ib_mw *mw)
 {
        u64 h_ret;
@@ -1616,7 +1289,6 @@ int ehca_reg_internal_maxmr(
        u64 *iova_start;
        u64 size_maxmr;
        struct ehca_mr_pginfo pginfo;
-       struct ib_phys_buf ib_pbuf;
        u32 num_kpages;
        u32 num_hwpages;
        u64 hw_pgsize;
@@ -1637,8 +1309,6 @@ int ehca_reg_internal_maxmr(
        /* register internal max-MR on HCA */
        size_maxmr = ehca_mr_len;
        iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
-       ib_pbuf.addr = 0;
-       ib_pbuf.size = size_maxmr;
        num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
                                PAGE_SIZE);
        hw_pgsize = ehca_get_max_hwpage_size(shca);
@@ -1650,8 +1320,8 @@ int ehca_reg_internal_maxmr(
        pginfo.num_kpages = num_kpages;
        pginfo.num_hwpages = num_hwpages;
        pginfo.hwpage_size = hw_pgsize;
-       pginfo.u.phy.num_phys_buf = 1;
-       pginfo.u.phy.phys_buf_array = &ib_pbuf;
+       pginfo.u.phy.addr = 0;
+       pginfo.u.phy.size = size_maxmr;
 
        ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
                          &pginfo, &e_mr->ib.ib_mr.lkey,
@@ -1669,7 +1339,6 @@ int ehca_reg_internal_maxmr(
        e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
        e_mr->ib.ib_mr.uobject = NULL;
        atomic_inc(&(e_pd->ib_pd.usecnt));
-       atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
        *e_maxmr = e_mr;
        return 0;
 
@@ -1762,61 +1431,6 @@ ehca_dereg_internal_maxmr_exit0:
 
 /*----------------------------------------------------------------------*/
 
-/*
- * check physical buffer array of MR verbs for validness and
- * calculates MR size
- */
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
-                                 int num_phys_buf,
-                                 u64 *iova_start,
-                                 u64 *size)
-{
-       struct ib_phys_buf *pbuf = phys_buf_array;
-       u64 size_count = 0;
-       u32 i;
-
-       if (num_phys_buf == 0) {
-               ehca_gen_err("bad phys buf array len, num_phys_buf=0");
-               return -EINVAL;
-       }
-       /* check first buffer */
-       if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
-               ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
-                            "pbuf->addr=%llx pbuf->size=%llx",
-                            iova_start, pbuf->addr, pbuf->size);
-               return -EINVAL;
-       }
-       if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
-           (num_phys_buf > 1)) {
-               ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
-                            "pbuf->size=%llx", pbuf->addr, pbuf->size);
-               return -EINVAL;
-       }
-
-       for (i = 0; i < num_phys_buf; i++) {
-               if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
-                       ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
-                                    "pbuf->size=%llx",
-                                    i, pbuf->addr, pbuf->size);
-                       return -EINVAL;
-               }
-               if (((i > 0) && /* not 1st */
-                    (i < (num_phys_buf - 1)) &&        /* not last */
-                    (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
-                       ehca_gen_err("bad size, i=%x pbuf->size=%llx",
-                                    i, pbuf->size);
-                       return -EINVAL;
-               }
-               size_count += pbuf->size;
-               pbuf++;
-       }
-
-       *size = size_count;
-       return 0;
-} /* end ehca_mr_chk_buf_and_calc_size() */
-
-/*----------------------------------------------------------------------*/
-
 /* check page list of map FMR verb for validness */
 int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
                             u64 *page_list,
@@ -2002,57 +1616,54 @@ static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
                                 u32 number, u64 *kpage)
 {
        int ret = 0;
-       struct ib_phys_buf *pbuf;
+       u64 addr = pginfo->u.phy.addr;
+       u64 size = pginfo->u.phy.size;
        u64 num_hw, offs_hw;
        u32 i = 0;
 
-       /* loop over desired phys_buf_array entries */
-       while (i < number) {
-               pbuf   = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
-               num_hw  = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
-                                    pbuf->size, pginfo->hwpage_size);
-               offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
-                       pginfo->hwpage_size;
-               while (pginfo->next_hwpage < offs_hw + num_hw) {
-                       /* sanity check */
-                       if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
-                           (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
-                               ehca_gen_err("kpage_cnt >= num_kpages, "
-                                            "kpage_cnt=%llx num_kpages=%llx "
-                                            "hwpage_cnt=%llx "
-                                            "num_hwpages=%llx i=%x",
-                                            pginfo->kpage_cnt,
-                                            pginfo->num_kpages,
-                                            pginfo->hwpage_cnt,
-                                            pginfo->num_hwpages, i);
-                               return -EFAULT;
-                       }
-                       *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
-                                (pginfo->next_hwpage * pginfo->hwpage_size);
-                       if ( !(*kpage) && pbuf->addr ) {
-                               ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
-                                            "next_hwpage=%llx", pbuf->addr,
-                                            pbuf->size, pginfo->next_hwpage);
-                               return -EFAULT;
-                       }
-                       (pginfo->hwpage_cnt)++;
-                       (pginfo->next_hwpage)++;
-                       if (PAGE_SIZE >= pginfo->hwpage_size) {
-                               if (pginfo->next_hwpage %
-                                   (PAGE_SIZE / pginfo->hwpage_size) == 0)
-                                       (pginfo->kpage_cnt)++;
-                       } else
-                               pginfo->kpage_cnt += pginfo->hwpage_size /
-                                       PAGE_SIZE;
-                       kpage++;
-                       i++;
-                       if (i >= number) break;
+       num_hw  = NUM_CHUNKS((addr % pginfo->hwpage_size) + size,
+                               pginfo->hwpage_size);
+       offs_hw = (addr & ~(pginfo->hwpage_size - 1)) / pginfo->hwpage_size;
+
+       while (pginfo->next_hwpage < offs_hw + num_hw) {
+               /* sanity check */
+               if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
+                   (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
+                       ehca_gen_err("kpage_cnt >= num_kpages, "
+                                    "kpage_cnt=%llx num_kpages=%llx "
+                                    "hwpage_cnt=%llx "
+                                    "num_hwpages=%llx i=%x",
+                                    pginfo->kpage_cnt,
+                                    pginfo->num_kpages,
+                                    pginfo->hwpage_cnt,
+                                    pginfo->num_hwpages, i);
+                       return -EFAULT;
                }
-               if (pginfo->next_hwpage >= offs_hw + num_hw) {
-                       (pginfo->u.phy.next_buf)++;
-                       pginfo->next_hwpage = 0;
+               *kpage = (addr & ~(pginfo->hwpage_size - 1)) +
+                        (pginfo->next_hwpage * pginfo->hwpage_size);
+               if (!(*kpage) && addr) {
+                       ehca_gen_err("addr=%llx size=%llx "
+                                    "next_hwpage=%llx", addr,
+                                    size, pginfo->next_hwpage);
+                       return -EFAULT;
                }
+               (pginfo->hwpage_cnt)++;
+               (pginfo->next_hwpage)++;
+               if (PAGE_SIZE >= pginfo->hwpage_size) {
+                       if (pginfo->next_hwpage %
+                           (PAGE_SIZE / pginfo->hwpage_size) == 0)
+                               (pginfo->kpage_cnt)++;
+               } else
+                       pginfo->kpage_cnt += pginfo->hwpage_size /
+                               PAGE_SIZE;
+               kpage++;
+               i++;
+               if (i >= number)
+                       break;
        }
+       if (pginfo->next_hwpage >= offs_hw + num_hw)
+               pginfo->next_hwpage = 0;
+
        return ret;
 }
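
For reference, the rewritten loop above derives its page window from a single
addr/size pair instead of walking a phys_buf_array. A minimal standalone sketch
of the chunk arithmetic, assuming NUM_CHUNKS rounds a byte range up to whole
chunks as the ehca driver defines it:

	#include <stdio.h>
	#include <stdint.h>

	/* assumed to match the ehca macro: round length up to whole chunks */
	#define NUM_CHUNKS(length, chunk_size) \
		(((length) + (chunk_size) - 1) / (chunk_size))

	int main(void)
	{
		uint64_t addr = 0x1800, size = 0x2000, hwpage = 0x1000;

		/* pages spanned: offset within the first page plus the length */
		uint64_t num_hw  = NUM_CHUNKS((addr % hwpage) + size, hwpage);
		/* index of the first hardware page containing addr */
		uint64_t offs_hw = (addr & ~(hwpage - 1)) / hwpage;

		/* 0x800 + 0x2000 = 0x2800 -> 3 pages, starting at page 1 */
		printf("num_hw=%llu offs_hw=%llu\n",
		       (unsigned long long)num_hw, (unsigned long long)offs_hw);
		return 0;
	}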
 
index 50d8b51306dd6a7a2b1958fd9edb5ae7e5d15737..52bfa95697f7cdb161e47cfbf24d7315f5c199c9 100644 (file)
@@ -98,11 +98,6 @@ int ehca_reg_maxmr(struct ehca_shca *shca,
 
 int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
 
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
-                                 int num_phys_buf,
-                                 u64 *iova_start,
-                                 u64 *size);
-
 int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
                             u64 *page_list,
                             int list_len);
index 10e2074384f5d83019b222e711b96459441bf378..11813b880e16da1014d293f612fc3fa06d60b31b 100644 (file)
@@ -614,7 +614,6 @@ int ehca_post_srq_recv(struct ib_srq *srq,
 static const u8 ib_wc_opcode[255] = {
        [0x01] = IB_WC_RECV+1,
        [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
-       [0x04] = IB_WC_BIND_MW+1,
        [0x08] = IB_WC_FETCH_ADD+1,
        [0x10] = IB_WC_COMP_SWAP+1,
        [0x20] = IB_WC_RDMA_WRITE+1,
index 568f185a022dffa5bf24a1af010261a4b4f7b5c0..a3f8b884fdd625fef680f5342655bc1832978eca 100644 (file)
@@ -167,10 +167,7 @@ static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd)
        rval = init_mregion(&mr->mr, pd, count);
        if (rval)
                goto bail;
-       /*
-        * ib_reg_phys_mr() will initialize mr->ibmr except for
-        * lkey and rkey.
-        */
+
        rval = hfi1_alloc_lkey(&mr->mr, 0);
        if (rval)
                goto bail_mregion;
@@ -187,52 +184,6 @@ bail:
        goto done;
 }
 
-/**
- * hfi1_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *buffer_list,
-                              int num_phys_buf, int acc, u64 *iova_start)
-{
-       struct hfi1_mr *mr;
-       int n, m, i;
-       struct ib_mr *ret;
-
-       mr = alloc_mr(num_phys_buf, pd);
-       if (IS_ERR(mr)) {
-               ret = (struct ib_mr *)mr;
-               goto bail;
-       }
-
-       mr->mr.user_base = *iova_start;
-       mr->mr.iova = *iova_start;
-       mr->mr.access_flags = acc;
-
-       m = 0;
-       n = 0;
-       for (i = 0; i < num_phys_buf; i++) {
-               mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
-               mr->mr.map[m]->segs[n].length = buffer_list[i].size;
-               mr->mr.length += buffer_list[i].size;
-               n++;
-               if (n == HFI1_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
 /**
  * hfi1_reg_user_mr - register a userspace memory region
  * @pd: protection domain for this memory region
index ef0feaa684a48b7b285ce490ad9791d98c052ddd..09b8d412ee9085667da5fd3bccf35d46724dc49e 100644 (file)
@@ -2052,7 +2052,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
        ibdev->poll_cq = hfi1_poll_cq;
        ibdev->req_notify_cq = hfi1_req_notify_cq;
        ibdev->get_dma_mr = hfi1_get_dma_mr;
-       ibdev->reg_phys_mr = hfi1_reg_phys_mr;
        ibdev->reg_user_mr = hfi1_reg_user_mr;
        ibdev->dereg_mr = hfi1_dereg_mr;
        ibdev->alloc_mr = hfi1_alloc_mr;
index 72106e5362b9862d448be822220f2ee7b0aa30f4..286e468b0479791c49c8b5dfeeb8b991d0ff2785 100644 (file)
@@ -1024,10 +1024,6 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 
 struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc);
 
-struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *buffer_list,
-                              int num_phys_buf, int acc, u64 *iova_start);
-
 struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                               u64 virt_addr, int mr_access_flags,
                               struct ib_udata *udata);
index 796af686700739c8401bd49c9ea97d43f6c3d5de..476fcdf05acb6c9eeae58cc7884c7521b197200d 100644 (file)
@@ -82,14 +82,14 @@ static int create_file(const char *name, umode_t mode,
 {
        int error;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        *dentry = lookup_one_len(name, parent, strlen(name));
        if (!IS_ERR(*dentry))
                error = ipathfs_mknod(d_inode(parent), *dentry,
                                      mode, fops, data);
        else
                error = PTR_ERR(*dentry);
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
 
        return error;
 }
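
The mutex_lock(&inode->i_mutex) conversions in this and the following hunks go
through the inode_lock()/inode_unlock() helpers. As a sketch of what the
callers now call into (matching the 4.5-era wrappers in include/linux/fs.h,
shown here only for orientation):

	/* thin wrappers over the inode mutex, cf. <linux/fs.h> */
	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

	static inline int inode_is_locked(struct inode *inode)
	{
		return mutex_is_locked(&inode->i_mutex);
	}
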
@@ -295,7 +295,7 @@ static int remove_device_files(struct super_block *sb,
        int ret;
 
        root = dget(sb->s_root);
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
        snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
        dir = lookup_one_len(unit, root, strlen(unit));
 
@@ -311,7 +311,7 @@ static int remove_device_files(struct super_block *sb,
        ret = simple_rmdir(d_inode(root), dir);
 
 bail:
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
        dput(root);
        return ret;
 }
index c7278f6a8217798c3572366828491aa1c57d5b49..b76b0ce66709b47718bc805d9a3954d76ffebaa0 100644 (file)
@@ -98,10 +98,6 @@ static struct ipath_mr *alloc_mr(int count,
        }
        mr->mr.mapsz = m;
 
-       /*
-        * ib_reg_phys_mr() will initialize mr->ibmr except for
-        * lkey and rkey.
-        */
        if (!ipath_alloc_lkey(lk_table, &mr->mr))
                goto bail;
        mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
@@ -120,57 +116,6 @@ done:
        return mr;
 }
 
-/**
- * ipath_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
-                               struct ib_phys_buf *buffer_list,
-                               int num_phys_buf, int acc, u64 *iova_start)
-{
-       struct ipath_mr *mr;
-       int n, m, i;
-       struct ib_mr *ret;
-
-       mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
-       if (mr == NULL) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       mr->mr.pd = pd;
-       mr->mr.user_base = *iova_start;
-       mr->mr.iova = *iova_start;
-       mr->mr.length = 0;
-       mr->mr.offset = 0;
-       mr->mr.access_flags = acc;
-       mr->mr.max_segs = num_phys_buf;
-       mr->umem = NULL;
-
-       m = 0;
-       n = 0;
-       for (i = 0; i < num_phys_buf; i++) {
-               mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
-               mr->mr.map[m]->segs[n].length = buffer_list[i].size;
-               mr->mr.length += buffer_list[i].size;
-               n++;
-               if (n == IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
 /**
  * ipath_reg_user_mr - register a userspace memory region
  * @pd: protection domain for this memory region
index 1778dee13f99269c51cc90f6c4d09a6f46a13a21..53f9dcab180d2e39b1eafcf5d7282be93cef5357 100644 (file)
@@ -2201,7 +2201,6 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
        dev->poll_cq = ipath_poll_cq;
        dev->req_notify_cq = ipath_req_notify_cq;
        dev->get_dma_mr = ipath_get_dma_mr;
-       dev->reg_phys_mr = ipath_reg_phys_mr;
        dev->reg_user_mr = ipath_reg_user_mr;
        dev->dereg_mr = ipath_dereg_mr;
        dev->alloc_fmr = ipath_alloc_fmr;
index 0a90a56870ab0653022b098d2eda6df510b59ad0..6c70a89667a97aeeff7d3289255b001bdda0515b 100644 (file)
@@ -828,10 +828,6 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 
 struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
 
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
-                               struct ib_phys_buf *buffer_list,
-                               int num_phys_buf, int acc, u64 *iova_start);
-
 struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                u64 virt_addr, int mr_access_flags,
                                struct ib_udata *udata);
index e77d15000caa313a8c3c8bb4a403ba8f1aa9a6c6..5a2899f9f50b6e4b88730f9db76bf085f97b2b88 100644 (file)
@@ -615,9 +615,9 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio)
        }
 
        bip = bio_integrity_alloc(bio, GFP_NOIO, cmd->t_prot_nents);
-       if (!bip) {
+       if (IS_ERR(bip)) {
                pr_err("Unable to allocate bio_integrity_payload\n");
-               return -ENOMEM;
+               return PTR_ERR(bip);
        }
 
        bip->bip_iter.bi_size = (cmd->data_length / dev->dev_attrib.block_size) *
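
bio_integrity_alloc() now reports failure as an ERR_PTR()-encoded errno rather
than NULL, so callers test with IS_ERR() and decode with PTR_ERR(). A minimal
sketch of the convention; alloc_thing() is a hypothetical helper, not a real
kernel API:

	#include <linux/err.h>
	#include <linux/slab.h>

	/* hypothetical allocator following the ERR_PTR convention */
	static void *alloc_thing(size_t n)
	{
		void *p = kmalloc(n, GFP_KERNEL);

		return p ? p : ERR_PTR(-ENOMEM);	/* errno in the pointer */
	}

	static int caller_sketch(void)
	{
		void *obj = alloc_thing(64);

		if (IS_ERR(obj))		/* true only for encoded errnos */
			return PTR_ERR(obj);	/* e.g. -ENOMEM */
		kfree(obj);
		return 0;
	}
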
index ccc0ad02d06698108ec486d11f7b0dc69c7c734a..36fa724a36c851d463afa53a67d1a8526efb75dc 100644 (file)
 /* Braswell thermal reporting device */
 #define PCI_DEVICE_ID_PROC_BSW_THERMAL 0x22DC
 
+/* Broxton thermal reporting device */
+#define PCI_DEVICE_ID_PROC_BXT0_THERMAL  0x0A8C
+#define PCI_DEVICE_ID_PROC_BXT1_THERMAL  0x1A8C
+#define PCI_DEVICE_ID_PROC_BXTX_THERMAL  0x4A8C
+#define PCI_DEVICE_ID_PROC_BXTP_THERMAL  0x5A8C
+
 struct power_config {
        u32     index;
        u32     min_uw;
@@ -404,6 +410,10 @@ static const struct pci_device_id proc_thermal_pci_ids[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_HSB_THERMAL)},
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_SKL_THERMAL)},
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BSW_THERMAL)},
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT0_THERMAL)},
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT1_THERMAL)},
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXTX_THERMAL)},
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXTP_THERMAL)},
        { 0, },
 };
 
index 50c7da79be830a43b342999ca80ef264431e0078..00d81af648b8ed93a9d7c2e09aef8979f7d0606d 100644 (file)
@@ -136,7 +136,7 @@ struct pch_dev_ops {
 
 
 /* dev ops for Wildcat Point */
-static struct pch_dev_ops pch_dev_ops_wpt = {
+static const struct pch_dev_ops pch_dev_ops_wpt = {
        .hw_init = pch_wpt_init,
        .get_temp = pch_wpt_get_temp,
 };
index 13d01edc7a043812611719bf7a342260f09ababa..44b9c485157d8c6e624548ee7c7cfccacea241d9 100644 (file)
@@ -75,11 +75,11 @@ struct rcar_thermal_priv {
 #define rcar_has_irq_support(priv)     ((priv)->common->base)
 #define rcar_id_to_shift(priv)         ((priv)->id * 8)
 
-#ifdef DEBUG
-# define rcar_force_update_temp(priv)  1
-#else
-# define rcar_force_update_temp(priv)  0
-#endif
+static const struct of_device_id rcar_thermal_dt_ids[] = {
+       { .compatible = "renesas,rcar-thermal", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, rcar_thermal_dt_ids);
 
 /*
  *             basic functions
@@ -203,14 +203,26 @@ err_out_unlock:
 static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
 {
        struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
+       int tmp;
+       int ret;
 
-       if (!rcar_has_irq_support(priv) || rcar_force_update_temp(priv))
-               rcar_thermal_update_temp(priv);
+       ret = rcar_thermal_update_temp(priv);
+       if (ret < 0)
+               return ret;
 
        mutex_lock(&priv->lock);
-       *temp =  MCELSIUS((priv->ctemp * 5) - 65);
+       tmp = MCELSIUS((priv->ctemp * 5) - 65);
        mutex_unlock(&priv->lock);
 
+       if ((tmp < MCELSIUS(-45)) || (tmp > MCELSIUS(125))) {
+               struct device *dev = rcar_priv_to_dev(priv);
+
+               dev_err(dev, "failed to measure temperature correctly\n");
+               return -EIO;
+       }
+
+       *temp = tmp;
+
        return 0;
 }
 
@@ -288,6 +300,9 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable)
        unsigned long flags;
        u32 mask = 0x3 << rcar_id_to_shift(priv); /* enable Rising/Falling */
 
+       if (!rcar_has_irq_support(priv))
+               return;
+
        spin_lock_irqsave(&common->lock, flags);
 
        rcar_thermal_common_bset(common, INTMSK, mask, enable ? 0 : mask);
@@ -299,11 +314,15 @@ static void rcar_thermal_work(struct work_struct *work)
 {
        struct rcar_thermal_priv *priv;
        int cctemp, nctemp;
+       int ret;
 
        priv = container_of(work, struct rcar_thermal_priv, work.work);
 
        rcar_thermal_get_temp(priv->zone, &cctemp);
-       rcar_thermal_update_temp(priv);
+       ret = rcar_thermal_update_temp(priv);
+       if (ret < 0)
+               return;
+
        rcar_thermal_irq_enable(priv);
 
        rcar_thermal_get_temp(priv->zone, &nctemp);
@@ -368,8 +387,7 @@ static int rcar_thermal_remove(struct platform_device *pdev)
        struct rcar_thermal_priv *priv;
 
        rcar_thermal_for_each_priv(priv, common) {
-               if (rcar_has_irq_support(priv))
-                       rcar_thermal_irq_disable(priv);
+               rcar_thermal_irq_disable(priv);
                thermal_zone_device_unregister(priv->zone);
        }
 
@@ -441,7 +459,9 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                mutex_init(&priv->lock);
                INIT_LIST_HEAD(&priv->list);
                INIT_DELAYED_WORK(&priv->work, rcar_thermal_work);
-               rcar_thermal_update_temp(priv);
+               ret = rcar_thermal_update_temp(priv);
+               if (ret < 0)
+                       goto error_unregister;
 
                priv->zone = thermal_zone_device_register("rcar_thermal",
                                                1, 0, priv,
@@ -453,8 +473,7 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                        goto error_unregister;
                }
 
-               if (rcar_has_irq_support(priv))
-                       rcar_thermal_irq_enable(priv);
+               rcar_thermal_irq_enable(priv);
 
                list_move_tail(&priv->list, &common->head);
 
@@ -484,12 +503,6 @@ error_unregister:
        return ret;
 }
 
-static const struct of_device_id rcar_thermal_dt_ids[] = {
-       { .compatible = "renesas,rcar-thermal", },
-       {},
-};
-MODULE_DEVICE_TABLE(of, rcar_thermal_dt_ids);
-
 static struct platform_driver rcar_thermal_driver = {
        .driver = {
                .name   = "rcar_thermal",
index e845841ab036cc82033d0a6829058a0f15f2543e..b58e3fb9b31186921e51edd0fa70bfe0e8c88059 100644 (file)
@@ -38,7 +38,7 @@ enum tshut_mode {
 };
 
 /**
- * the system Temperature Sensors tshut(tshut) polarity
+ * The system Temperature Sensors' tshut (thermal shutdown) polarity:
  * the bit 8 is tshut polarity.
  * 0: low active, 1: high active
  */
@@ -57,10 +57,10 @@ enum sensor_id {
 };
 
 /**
-* The conversion table has the adc value and temperature.
-* ADC_DECREMENT is the adc value decremnet.(e.g. v2_code_table)
-* ADC_INCREMNET is the adc value incremnet.(e.g. v3_code_table)
-*/
+ * The conversion table maps adc values to temperatures.
+ * ADC_DECREMENT: the adc value decreases as temperature rises (e.g. v2_code_table)
+ * ADC_INCREMENT: the adc value increases as temperature rises (e.g. v3_code_table)
+ */
 enum adc_sort_mode {
        ADC_DECREMENT = 0,
        ADC_INCREMENT,
@@ -72,16 +72,17 @@ enum adc_sort_mode {
  */
 #define SOC_MAX_SENSORS        2
 
+/**
+ * struct chip_tsadc_table - hold information about chip-specific differences
+ * @id: conversion table
+ * @length: size of conversion table
+ * @data_mask: mask to apply on data inputs
+ * @mode: sort mode of this adc variant (incrementing or decrementing)
+ */
 struct chip_tsadc_table {
        const struct tsadc_table *id;
-
-       /* the array table size*/
        unsigned int length;
-
-       /* that analogic mask data */
        u32 data_mask;
-
-       /* the sort mode is adc value that increment or decrement in table */
        enum adc_sort_mode mode;
 };
 
@@ -153,6 +154,7 @@ struct rockchip_thermal_data {
 #define TSADCV2_SHUT_2GPIO_SRC_EN(chn)         BIT(4 + (chn))
 #define TSADCV2_SHUT_2CRU_SRC_EN(chn)          BIT(8 + (chn))
 
+#define TSADCV1_INT_PD_CLEAR_MASK              ~BIT(16)
 #define TSADCV2_INT_PD_CLEAR_MASK              ~BIT(8)
 
 #define TSADCV2_DATA_MASK                      0xfff
@@ -168,6 +170,51 @@ struct tsadc_table {
        int temp;
 };
 
+/**
+ * Note:
+ * The code-to-temperature mapping of the temperature sensor is a piecewise
+ * linear curve. Any code falling between two given temperatures can be
+ * linearly interpolated.
+ * The code-to-temperature mapping should be updated based on silicon results.
+ */
+static const struct tsadc_table v1_code_table[] = {
+       {TSADCV3_DATA_MASK, -40000},
+       {436, -40000},
+       {431, -35000},
+       {426, -30000},
+       {421, -25000},
+       {416, -20000},
+       {411, -15000},
+       {406, -10000},
+       {401, -5000},
+       {395, 0},
+       {390, 5000},
+       {385, 10000},
+       {380, 15000},
+       {375, 20000},
+       {370, 25000},
+       {364, 30000},
+       {359, 35000},
+       {354, 40000},
+       {349, 45000},
+       {343, 50000},
+       {338, 55000},
+       {333, 60000},
+       {328, 65000},
+       {322, 70000},
+       {317, 75000},
+       {312, 80000},
+       {307, 85000},
+       {301, 90000},
+       {296, 95000},
+       {291, 100000},
+       {286, 105000},
+       {280, 110000},
+       {275, 115000},
+       {270, 120000},
+       {264, 125000},
+};
+
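
The Note above describes piecewise-linear interpolation between adjacent table
rows. A standalone sketch for a decrementing table (ADC_DECREMENT); the lookup
helper and its linear search are illustrative, not the driver's actual code:

	#include <stdio.h>
	#include <stdint.h>

	struct tsadc_table { uint32_t code; int temp; };  /* mirrors the driver */

	/* excerpt of a v1-style table: codes fall as temperature rises */
	static const struct tsadc_table tbl[] = {
		{436, -40000}, {431, -35000}, {426, -30000}, {421, -25000},
	};

	/* hypothetical helper: interpolate the temperature for a raw code */
	static int code_to_temp_sketch(uint32_t code, int *temp)
	{
		unsigned int i;

		for (i = 1; i < sizeof(tbl) / sizeof(tbl[0]); i++) {
			if (code <= tbl[i - 1].code && code > tbl[i].code) {
				*temp = tbl[i - 1].temp +
					(tbl[i].temp - tbl[i - 1].temp) *
					(int)(tbl[i - 1].code - code) /
					(int)(tbl[i - 1].code - tbl[i].code);
				return 0;
			}
		}
		return -1;	/* code outside the table range */
	}

	int main(void)
	{
		int temp;

		if (!code_to_temp_sketch(434, &temp))
			printf("%d\n", temp);	/* 2/5 of 436->431: -38000 */
		return 0;
	}
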
 static const struct tsadc_table v2_code_table[] = {
        {TSADCV2_DATA_MASK, -40000},
        {3800, -40000},
@@ -245,6 +292,44 @@ static const struct tsadc_table v3_code_table[] = {
        {TSADCV3_DATA_MASK, 125000},
 };
 
+static const struct tsadc_table v4_code_table[] = {
+       {TSADCV3_DATA_MASK, -40000},
+       {431, -40000},
+       {426, -35000},
+       {421, -30000},
+       {415, -25000},
+       {410, -20000},
+       {405, -15000},
+       {399, -10000},
+       {394, -5000},
+       {389, 0},
+       {383, 5000},
+       {378, 10000},
+       {373, 15000},
+       {367, 20000},
+       {362, 25000},
+       {357, 30000},
+       {351, 35000},
+       {346, 40000},
+       {340, 45000},
+       {335, 50000},
+       {330, 55000},
+       {324, 60000},
+       {319, 65000},
+       {313, 70000},
+       {308, 75000},
+       {302, 80000},
+       {297, 85000},
+       {291, 90000},
+       {286, 95000},
+       {281, 100000},
+       {275, 105000},
+       {270, 110000},
+       {264, 115000},
+       {259, 120000},
+       {253, 125000},
+};
+
 static u32 rk_tsadcv2_temp_to_code(struct chip_tsadc_table table,
                                   int temp)
 {
@@ -368,6 +453,14 @@ static void rk_tsadcv2_initialize(void __iomem *regs,
                       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
 }
 
+static void rk_tsadcv1_irq_ack(void __iomem *regs)
+{
+       u32 val;
+
+       val = readl_relaxed(regs + TSADCV2_INT_PD);
+       writel_relaxed(val & TSADCV1_INT_PD_CLEAR_MASK, regs + TSADCV2_INT_PD);
+}
+
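
The v1 ack differs from the v2 one below only in which status bit it clears:
TSADCV1_INT_PD_CLEAR_MASK is ~BIT(16) where v2 uses ~BIT(8), applied
read-modify-write so the other interrupt-pending bits survive. A tiny
illustration with assumed register contents:

	#include <stdio.h>
	#include <stdint.h>

	#define BIT(n)	(1U << (n))

	int main(void)
	{
		uint32_t val = BIT(16) | BIT(3);  /* assumed pending bits */

		val &= ~BIT(16);	/* clear only the v1 status bit */
		printf("0x%x\n", val);	/* 0x8: unrelated bit preserved */
		return 0;
	}
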
 static void rk_tsadcv2_irq_ack(void __iomem *regs)
 {
        u32 val;
@@ -429,6 +522,29 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem *regs,
        writel_relaxed(val, regs + TSADCV2_INT_EN);
 }
 
+static const struct rockchip_tsadc_chip rk3228_tsadc_data = {
+       .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
+       .chn_num = 1, /* one channel for tsadc */
+
+       .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO to PMIC */
+       .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
+       .tshut_temp = 95000,
+
+       .initialize = rk_tsadcv2_initialize,
+       .irq_ack = rk_tsadcv1_irq_ack,
+       .control = rk_tsadcv2_control,
+       .get_temp = rk_tsadcv2_get_temp,
+       .set_tshut_temp = rk_tsadcv2_tshut_temp,
+       .set_tshut_mode = rk_tsadcv2_tshut_mode,
+
+       .table = {
+               .id = v1_code_table,
+               .length = ARRAY_SIZE(v1_code_table),
+               .data_mask = TSADCV3_DATA_MASK,
+               .mode = ADC_DECREMENT,
+       },
+};
+
 static const struct rockchip_tsadc_chip rk3288_tsadc_data = {
        .chn_id[SENSOR_CPU] = 1, /* cpu sensor is channel 1 */
        .chn_id[SENSOR_GPU] = 2, /* gpu sensor is channel 2 */
@@ -477,7 +593,35 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = {
        },
 };
 
+static const struct rockchip_tsadc_chip rk3399_tsadc_data = {
+       .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
+       .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
+       .chn_num = 2, /* two channels for tsadc */
+
+       .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO to PMIC */
+       .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */
+       .tshut_temp = 95000,
+
+       .initialize = rk_tsadcv2_initialize,
+       .irq_ack = rk_tsadcv1_irq_ack,
+       .control = rk_tsadcv2_control,
+       .get_temp = rk_tsadcv2_get_temp,
+       .set_tshut_temp = rk_tsadcv2_tshut_temp,
+       .set_tshut_mode = rk_tsadcv2_tshut_mode,
+
+       .table = {
+               .id = v4_code_table,
+               .length = ARRAY_SIZE(v4_code_table),
+               .data_mask = TSADCV3_DATA_MASK,
+               .mode = ADC_DECREMENT,
+       },
+};
+
 static const struct of_device_id of_rockchip_thermal_match[] = {
+       {
+               .compatible = "rockchip,rk3228-tsadc",
+               .data = (void *)&rk3228_tsadc_data,
+       },
        {
                .compatible = "rockchip,rk3288-tsadc",
                .data = (void *)&rk3288_tsadc_data,
@@ -486,6 +630,10 @@ static const struct of_device_id of_rockchip_thermal_match[] = {
                .compatible = "rockchip,rk3368-tsadc",
                .data = (void *)&rk3368_tsadc_data,
        },
+       {
+               .compatible = "rockchip,rk3399-tsadc",
+               .data = (void *)&rk3399_tsadc_data,
+       },
        { /* end */ },
 };
 MODULE_DEVICE_TABLE(of, of_rockchip_thermal_match);
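
Each compatible string carries its chip description in .data. A sketch of how a
probe path would typically recover it; the function name is illustrative and
error handling is reduced to the essentials:

	/* sketch: fetch the per-chip data matched from the DT table above */
	static int rockchip_thermal_probe_sketch(struct platform_device *pdev)
	{
		const struct of_device_id *match;
		const struct rockchip_tsadc_chip *chip;

		match = of_match_device(of_rockchip_thermal_match, &pdev->dev);
		if (!match)
			return -ENODEV;

		/* e.g. &rk3228_tsadc_data for "rockchip,rk3228-tsadc" */
		chip = match->data;
		return chip ? 0 : -EINVAL;
	}
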
@@ -617,7 +765,7 @@ rockchip_thermal_register_sensor(struct platform_device *pdev,
        return 0;
 }
 
-/*
+/**
  * Reset TSADC Controller, reset all tsadc registers.
  */
 static void rockchip_thermal_reset_controller(struct reset_control *reset)
index 2f9f7086ac3dd0d7a3a71015593fffacf8ee586e..ea9366ad3e6bb285e52e368691a0d495cbb3429f 100644 (file)
@@ -63,6 +63,19 @@ static unsigned long get_target_state(struct thermal_instance *instance,
        next_target = instance->target;
        dev_dbg(&cdev->device, "cur_state=%ld\n", cur_state);
 
+       if (!instance->initialized) {
+               if (throttle) {
+                       next_target = (cur_state + 1) >= instance->upper ?
+                                       instance->upper :
+                                       ((cur_state + 1) < instance->lower ?
+                                       instance->lower : (cur_state + 1));
+               } else {
+                       next_target = THERMAL_NO_TARGET;
+               }
+
+               return next_target;
+       }
+
        switch (trend) {
        case THERMAL_TREND_RAISING:
                if (throttle) {
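
The nested conditional in the new !instance->initialized branch simply clamps
cur_state + 1 into the instance's [lower, upper] window when throttling. An
equivalent, more readable sketch (THERMAL_NO_TARGET is -1UL in the kernel):

	/* equivalent form of the first-evaluation target selection above */
	static unsigned long first_target_sketch(unsigned long cur_state,
						 unsigned long lower,
						 unsigned long upper,
						 int throttle)
	{
		unsigned long next = cur_state + 1;

		if (!throttle)
			return -1UL;	/* THERMAL_NO_TARGET */

		if (next > upper)
			next = upper;
		if (next < lower)
			next = lower;
		return next;
	}
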
@@ -149,7 +162,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
                dev_dbg(&instance->cdev->device, "old_target=%d, target=%d\n",
                                        old_target, (int)instance->target);
 
-               if (old_target == instance->target)
+               if (instance->initialized && old_target == instance->target)
                        continue;
 
                /* Activate a passive thermal instance */
@@ -161,7 +174,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
                        instance->target == THERMAL_NO_TARGET)
                        update_passive_instance(tz, trip_type, -1);
 
-
+               instance->initialized = true;
                instance->cdev->updated = false; /* cdev needs update */
        }
 
index d9e525cc9c1ce24bcd9d55dd59730f375b15d552..a0a8fd1235e2a21cbe5553c440b26eafecf8042e 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/of.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
+#include <linux/suspend.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/thermal.h>
@@ -59,6 +60,8 @@ static LIST_HEAD(thermal_governor_list);
 static DEFINE_MUTEX(thermal_list_lock);
 static DEFINE_MUTEX(thermal_governor_lock);
 
+static atomic_t in_suspend;
+
 static struct thermal_governor *def_governor;
 
 static struct thermal_governor *__find_governor(const char *name)
@@ -532,14 +535,31 @@ static void update_temperature(struct thermal_zone_device *tz)
        mutex_unlock(&tz->lock);
 
        trace_thermal_temperature(tz);
-       dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n",
-                               tz->last_temperature, tz->temperature);
+       if (tz->last_temperature == THERMAL_TEMP_INVALID)
+               dev_dbg(&tz->device, "last_temperature N/A, current_temperature=%d\n",
+                       tz->temperature);
+       else
+               dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n",
+                       tz->last_temperature, tz->temperature);
+}
+
+static void thermal_zone_device_reset(struct thermal_zone_device *tz)
+{
+       struct thermal_instance *pos;
+
+       tz->temperature = THERMAL_TEMP_INVALID;
+       tz->passive = 0;
+       list_for_each_entry(pos, &tz->thermal_instances, tz_node)
+               pos->initialized = false;
 }
 
 void thermal_zone_device_update(struct thermal_zone_device *tz)
 {
        int count;
 
+       if (atomic_read(&in_suspend))
+               return;
+
        if (!tz->ops->get_temp)
                return;
 
@@ -676,8 +696,12 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
                return -EINVAL;
 
        ret = tz->ops->set_trip_temp(tz, trip, temperature);
+       if (ret)
+               return ret;
 
-       return ret ? ret : count;
+       thermal_zone_device_update(tz);
+
+       return count;
 }
 
 static ssize_t
@@ -1321,6 +1345,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
        if (!result) {
                list_add_tail(&dev->tz_node, &tz->thermal_instances);
                list_add_tail(&dev->cdev_node, &cdev->thermal_instances);
+               atomic_set(&tz->need_update, 1);
        }
        mutex_unlock(&cdev->lock);
        mutex_unlock(&tz->lock);
@@ -1430,6 +1455,7 @@ __thermal_cooling_device_register(struct device_node *np,
                                  const struct thermal_cooling_device_ops *ops)
 {
        struct thermal_cooling_device *cdev;
+       struct thermal_zone_device *pos = NULL;
        int result;
 
        if (type && strlen(type) >= THERMAL_NAME_LENGTH)
@@ -1474,6 +1500,12 @@ __thermal_cooling_device_register(struct device_node *np,
        /* Update binding information for 'this' new cdev */
        bind_cdev(cdev);
 
+       mutex_lock(&thermal_list_lock);
+       list_for_each_entry(pos, &thermal_tz_list, node)
+               if (atomic_cmpxchg(&pos->need_update, 1, 0))
+                       thermal_zone_device_update(pos);
+       mutex_unlock(&thermal_list_lock);
+
        return cdev;
 }
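
atomic_cmpxchg() returns the old value, so the need_update flag is consumed
exactly once: whichever path observes it set first (the registration scan above
or thermal_zone_device_register() later) performs the update. A sketch of that
consume-once idiom:

	/* sketch: claim a one-shot flag; only the first caller sees 1 */
	static bool claim_update(atomic_t *flag)
	{
		return atomic_cmpxchg(flag, 1, 0) == 1;
	}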
 
@@ -1806,6 +1838,8 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
        tz->trips = trips;
        tz->passive_delay = passive_delay;
        tz->polling_delay = polling_delay;
+       /* A new thermal zone needs to be updated anyway. */
+       atomic_set(&tz->need_update, 1);
 
        dev_set_name(&tz->device, "thermal_zone%d", tz->id);
        result = device_register(&tz->device);
@@ -1900,7 +1934,10 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 
        INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check);
 
-       thermal_zone_device_update(tz);
+       thermal_zone_device_reset(tz);
+       /* Update the new thermal zone and mark it as already updated. */
+       if (atomic_cmpxchg(&tz->need_update, 1, 0))
+               thermal_zone_device_update(tz);
 
        return tz;
 
@@ -2140,6 +2177,36 @@ static void thermal_unregister_governors(void)
        thermal_gov_power_allocator_unregister();
 }
 
+static int thermal_pm_notify(struct notifier_block *nb,
+                               unsigned long mode, void *_unused)
+{
+       struct thermal_zone_device *tz;
+
+       switch (mode) {
+       case PM_HIBERNATION_PREPARE:
+       case PM_RESTORE_PREPARE:
+       case PM_SUSPEND_PREPARE:
+               atomic_set(&in_suspend, 1);
+               break;
+       case PM_POST_HIBERNATION:
+       case PM_POST_RESTORE:
+       case PM_POST_SUSPEND:
+               atomic_set(&in_suspend, 0);
+               list_for_each_entry(tz, &thermal_tz_list, node) {
+                       thermal_zone_device_reset(tz);
+                       thermal_zone_device_update(tz);
+               }
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+static struct notifier_block thermal_pm_nb = {
+       .notifier_call = thermal_pm_notify,
+};
+
 static int __init thermal_init(void)
 {
        int result;
@@ -2160,6 +2227,11 @@ static int __init thermal_init(void)
        if (result)
                goto exit_netlink;
 
+       result = register_pm_notifier(&thermal_pm_nb);
+       if (result)
+               pr_warn("Thermal: Cannot register suspend notifier, error %d\n",
+                       result);
+
        return 0;
 
 exit_netlink:
@@ -2179,6 +2251,7 @@ error:
 
 static void __exit thermal_exit(void)
 {
+       unregister_pm_notifier(&thermal_pm_nb);
        of_thermal_destroy_zones();
        genetlink_exit();
        class_unregister(&thermal_class);
index d7ac1fccd6598a80453dc51da655e37b30b85ac7..749d41abfbabecfc8fc33f54f4bc2e5d680450f8 100644 (file)
@@ -41,6 +41,7 @@ struct thermal_instance {
        struct thermal_zone_device *tz;
        struct thermal_cooling_device *cdev;
        int trip;
+       bool initialized;
        unsigned long upper;    /* Highest cooling state for this trip point */
        unsigned long lower;    /* Lowest cooling state for this trip point */
        unsigned long target;   /* expected cooling state */
index 0fbfb2b2aa08b3ade8e4b487d012188687b29e05..26ccad5d8680a3a1b5c75b4bf89018340eeb1846 100644 (file)
@@ -673,7 +673,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync)
        unsigned long           flags;
        int                     tx_list_empty;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        spin_lock_irqsave(&dev->lock, flags);
        tx_list_empty = (likely(list_empty(&dev->tx_reqs)));
        spin_unlock_irqrestore(&dev->lock, flags);
@@ -683,7 +683,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync)
                wait_event_interruptible(dev->tx_flush_wait,
                                (likely(list_empty(&dev->tx_reqs_active))));
        }
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return 0;
 }
index 365afd7e14f8cade738683a06616b4116fa84fc9..7e179f81d05cae3e91fd8b976a1f2665e3958cc3 100644 (file)
@@ -1521,10 +1521,10 @@ static void destroy_ep_files (struct dev_data *dev)
                spin_unlock_irq (&dev->lock);
 
                /* break link to dcache */
-               mutex_lock (&parent->i_mutex);
+               inode_lock(parent);
                d_delete (dentry);
                dput (dentry);
-               mutex_unlock (&parent->i_mutex);
+               inode_unlock(parent);
 
                spin_lock_irq (&dev->lock);
        }
index f92f5aff0dd5e8ece600cc44b5ba6168e3f1bb81..8755b2c2aadaa60700471f5260408bff6e28b69c 100644 (file)
@@ -91,7 +91,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf,
        if (!access_ok(VERIFY_WRITE, buf, nbytes))
                return -EFAULT;
 
-       mutex_lock(&file_inode(file)->i_mutex);
+       inode_lock(file_inode(file));
        list_for_each_entry_safe(req, tmp_req, queue, queue) {
                len = snprintf(tmpbuf, sizeof(tmpbuf),
                                "%8p %08x %c%c%c %5d %c%c%c\n",
@@ -118,7 +118,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf,
                nbytes -= len;
                buf += len;
        }
-       mutex_unlock(&file_inode(file)->i_mutex);
+       inode_unlock(file_inode(file));
 
        return actual;
 }
@@ -143,7 +143,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file)
        u32 *data;
        int ret = -ENOMEM;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        udc = inode->i_private;
        data = kmalloc(inode->i_size, GFP_KERNEL);
        if (!data)
@@ -158,7 +158,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file)
        ret = 0;
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return ret;
 }
@@ -169,11 +169,11 @@ static ssize_t regs_dbg_read(struct file *file, char __user *buf,
        struct inode *inode = file_inode(file);
        int ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = simple_read_from_buffer(buf, nbytes, ppos,
                        file->private_data,
                        file_inode(file)->i_size);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return ret;
 }
index daa563ff1fa05c9c28e2c8a03876c2310aaf0438..1f117c360ebbba3394acd1b934e43632ac2b3531 100644 (file)
@@ -229,6 +229,8 @@ config USB_EHCI_TEGRA
        depends on ARCH_TEGRA
        select USB_EHCI_ROOT_HUB_TT
        select USB_PHY
+       select USB_ULPI
+       select USB_ULPI_VIEWPORT
        help
          This driver enables support for the internal USB Host Controllers
          found in NVIDIA Tegra SoCs. The controllers are EHCI compliant.
index 3fc63c208d08ba9fa801570d830220106aa24f21..57721c73177fa1184d253155a6bcce39bdb2e927 100644 (file)
@@ -78,13 +78,13 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy
        if (!info->fbdefio)
                return 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        /* Kill off the delayed work */
        cancel_delayed_work_sync(&info->deferred_work);
 
        /* Run it immediately */
        schedule_delayed_work(&info->deferred_work, 0);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return 0;
 }
index 7bf835f85bc822ef1119b639be82619af066d326..eadc894faea2ead1f720ad22da63a42889967ebe 100644 (file)
@@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
        if (retval)
                return retval;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
        fid = filp->private_data;
        v9fs_blank_wstat(&wstat);
 
        retval = p9_client_wstat(fid, &wstat);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return retval;
 }
@@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
        if (retval)
                return retval;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
        fid = filp->private_data;
 
        retval = p9_client_fsync(fid, datasync);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return retval;
 }
index 659c579c4588b2a5e844d0e82a0195494d70a073..0548c53f41d51902a4e0ce7b983f7cd365f656f9 100644 (file)
@@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp)
                 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
 
        if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                if (inode->i_size != AFFS_I(inode)->mmu_private)
                        affs_truncate(inode);
                affs_free_prealloc(inode);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
 
        return 0;
@@ -958,12 +958,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
        if (err)
                return err;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = write_inode_now(inode, 0);
        err = sync_blockdev(inode->i_sb->s_bdev);
        if (!ret)
                ret = err;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 const struct file_operations affs_file_operations = {
index 4baf1d2b39e410ffb501c2aba457fdceadcc2cdb..d91a9c9cfbd0a9aa0770ad394af0c156057a8a85 100644 (file)
@@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 
        fl->fl_type = F_UNLCK;
 
-       mutex_lock(&vnode->vfs_inode.i_mutex);
+       inode_lock(&vnode->vfs_inode);
 
        /* check local lock records first */
        ret = 0;
@@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
        }
 
 error:
-       mutex_unlock(&vnode->vfs_inode.i_mutex);
+       inode_unlock(&vnode->vfs_inode);
        _leave(" = %d [%hd]", ret, fl->fl_type);
        return ret;
 }
index 0714abcd7f32321754287e46aec129196832e2ef..dfef94f70667cde167dad60d71d1e23622c24cf7 100644 (file)
@@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret)
                return ret;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* use a writeback record as a marker in the queue - when this reaches
         * the front of the queue, all the outstanding writes are either
@@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        afs_put_writeback(wb);
        _leave(" = %d", ret);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
index 6530ced19697d49a9189fa289c9112187448fee3..25b24d0f6c8810c86ef79319e091fc65231733d0 100644 (file)
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
        struct timespec now;
        unsigned int ia_valid = attr->ia_valid;
 
-       WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+       WARN_ON_ONCE(!inode_is_locked(inode));
 
        if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
                if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
index 78f005f378476011a43c41e7d0eaa8d99ac1af21..3a3ced779fc738199cb5bb3d25a8f8c76edc754d 100644 (file)
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
        case 3:
                /* Delete this handler. */
                root = dget(file->f_path.dentry->d_sb->s_root);
-               mutex_lock(&d_inode(root)->i_mutex);
+               inode_lock(d_inode(root));
 
                kill_node(e);
 
-               mutex_unlock(&d_inode(root)->i_mutex);
+               inode_unlock(d_inode(root));
                dput(root);
                break;
        default:
@@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
                return PTR_ERR(e);
 
        root = dget(sb->s_root);
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
        dentry = lookup_one_len(e->name, root, strlen(e->name));
        err = PTR_ERR(dentry);
        if (IS_ERR(dentry))
@@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 out2:
        dput(dentry);
 out:
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
        dput(root);
 
        if (err) {
@@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
        case 3:
                /* Delete all handlers. */
                root = dget(file->f_path.dentry->d_sb->s_root);
-               mutex_lock(&d_inode(root)->i_mutex);
+               inode_lock(d_inode(root));
 
                while (!list_empty(&entries))
                        kill_node(list_entry(entries.next, Node, list));
 
-               mutex_unlock(&d_inode(root)->i_mutex);
+               inode_unlock(d_inode(root));
                dput(root);
                break;
        default:
index ba762ea07f679c9bafd078bb316270a38e9a7d80..7b9cd49622b132f5f71e6557f05d0a2591a37c16 100644 (file)
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
 {
        struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-       if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                return;
 
        invalidate_bh_lrus();
@@ -346,9 +346,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t retval;
 
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
        retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
        return retval;
 }
        
@@ -1142,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 {
        unsigned bsize = bdev_logical_block_size(bdev);
 
-       mutex_lock(&bdev->bd_inode->i_mutex);
+       inode_lock(bdev->bd_inode);
        i_size_write(bdev->bd_inode, size);
-       mutex_unlock(&bdev->bd_inode->i_mutex);
+       inode_unlock(bdev->bd_inode);
        while (bsize < PAGE_CACHE_SIZE) {
                if (size & bsize)
                        break;
@@ -1741,9 +1741,9 @@ static void blkdev_vm_open(struct vm_area_struct *vma)
        struct inode *bd_inode = bdev_file_inode(vma->vm_file);
        struct block_device *bdev = I_BDEV(bd_inode);
 
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
        bdev->bd_map_count++;
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
 }
 
 static void blkdev_vm_close(struct vm_area_struct *vma)
@@ -1751,9 +1751,9 @@ static void blkdev_vm_close(struct vm_area_struct *vma)
        struct inode *bd_inode = bdev_file_inode(vma->vm_file);
        struct block_device *bdev = I_BDEV(bd_inode);
 
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
        bdev->bd_map_count--;
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
 }
 
 static const struct vm_operations_struct blkdev_dax_vm_ops = {
@@ -1777,7 +1777,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
        struct block_device *bdev = I_BDEV(bd_inode);
 
        file_accessed(file);
-       mutex_lock(&bd_inode->i_mutex);
+       inode_lock(bd_inode);
        bdev->bd_map_count++;
        if (IS_DAX(bd_inode)) {
                vma->vm_ops = &blkdev_dax_vm_ops;
@@ -1785,7 +1785,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
        } else {
                vma->vm_ops = &blkdev_default_vm_ops;
        }
-       mutex_unlock(&bd_inode->i_mutex);
+       inode_unlock(bd_inode);
 
        return 0;
 }
index 08405a3da6b1baf9a382327d72dc5a5a85d0c68d..b90cd3776f8e0a41635ce7cf5447e7b9652e6cca 100644 (file)
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -560,13 +560,13 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
  */
 static void __merge_refs(struct list_head *head, int mode)
 {
-       struct __prelim_ref *ref1;
+       struct __prelim_ref *pos1;
 
-       list_for_each_entry(ref1, head, list) {
-               struct __prelim_ref *ref2 = ref1, *tmp;
+       list_for_each_entry(pos1, head, list) {
+               struct __prelim_ref *pos2 = pos1, *tmp;
 
-               list_for_each_entry_safe_continue(ref2, tmp, head, list) {
-                       struct __prelim_ref *xchg;
+               list_for_each_entry_safe_continue(pos2, tmp, head, list) {
+                       struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2;
                        struct extent_inode_elem *eie;
 
                        if (!ref_for_same_block(ref1, ref2))
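
The __merge_refs rework above separates the list cursors (pos1, pos2) from the working pointers (ref1, ref2): the working pair may be swapped through the local xchg before one entry is merged away and freed, and doing that to an iteration cursor would corrupt the walk. A generic sketch of the safe pairwise-merge pattern, with struct item and the merge step purely illustrative:

        #include <linux/list.h>
        #include <linux/slab.h>

        struct item {
                struct list_head list;
                int key;
        };

        static void merge_equal_pairs(struct list_head *head)
        {
                struct item *pos1;

                list_for_each_entry(pos1, head, list) {
                        struct item *pos2 = pos1, *tmp;

                        /* tmp keeps the inner walk valid when pos2 is freed */
                        list_for_each_entry_safe_continue(pos2, tmp, head, list) {
                                struct item *keep = pos1, *gone = pos2;

                                if (keep->key != gone->key)
                                        continue;
                                /* operate on local aliases; the cursors stay put:
                                 * merge gone into keep here, then drop gone */
                                list_del(&gone->list);
                                kfree(gone);
                        }
                }
        }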
index 97ad9bbeb35d24ec0ad228aa6f040b61bffb33a5..bfe4a337fb4d13a058446265b7baf4a1437aa602 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1614,7 +1614,7 @@ struct btrfs_fs_info {
 
        spinlock_t delayed_iput_lock;
        struct list_head delayed_iputs;
-       struct rw_semaphore delayed_iput_sem;
+       struct mutex cleaner_delayed_iput_mutex;
 
        /* this protects tree_mod_seq_list */
        spinlock_t tree_mod_seq_lock;
@@ -3641,6 +3641,7 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 int __get_raid_index(u64 flags);
 int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
 void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
 void check_system_chunk(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        const u64 type);
index 1e668fb7dd4c73dbc03b7f642660f2af20bd2ddb..cbb7dbfb3fffffc54f621b8dc43a7b1ec9a1185b 100644 (file)
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
                em = lookup_extent_mapping(em_tree, start, (u64)-1);
                if (!em)
                        break;
-               map = (struct map_lookup *)em->bdev;
+               map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++)
                        if (srcdev == map->stripes[i].dev)
                                map->stripes[i].dev = tgtdev;
index e99ccd6ffb2c14f58bf38f548e202f0e0f86e9ea..dd08e29f51177d27859c12f0d173224a24a1f4a9 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
 #include <asm/cpufeature.h>
 #endif
 
+#define BTRFS_SUPER_FLAG_SUPP  (BTRFS_HEADER_FLAG_WRITTEN |\
+                                BTRFS_HEADER_FLAG_RELOC |\
+                                BTRFS_SUPER_FLAG_ERROR |\
+                                BTRFS_SUPER_FLAG_SEEDING |\
+                                BTRFS_SUPER_FLAG_METADUMP)
+
 static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
@@ -1583,8 +1589,23 @@ int btrfs_init_fs_root(struct btrfs_root *root)
        ret = get_anon_bdev(&root->anon_dev);
        if (ret)
                goto free_writers;
+
+       mutex_lock(&root->objectid_mutex);
+       ret = btrfs_find_highest_objectid(root,
+                                       &root->highest_objectid);
+       if (ret) {
+               mutex_unlock(&root->objectid_mutex);
+               goto free_root_dev;
+       }
+
+       ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+       mutex_unlock(&root->objectid_mutex);
+
        return 0;
 
+free_root_dev:
+       free_anon_bdev(root->anon_dev);
 free_writers:
        btrfs_free_subvolume_writers(root->subv_writers);
 fail:
@@ -1786,7 +1807,10 @@ static int cleaner_kthread(void *arg)
                        goto sleep;
                }
 
+               mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
                btrfs_run_delayed_iputs(root);
+               mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+
                again = btrfs_clean_one_deleted_snapshot(root);
                mutex_unlock(&root->fs_info->cleaner_mutex);
 
@@ -2556,8 +2580,8 @@ int open_ctree(struct super_block *sb,
        mutex_init(&fs_info->delete_unused_bgs_mutex);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
+       mutex_init(&fs_info->cleaner_delayed_iput_mutex);
        seqlock_init(&fs_info->profiles_lock);
-       init_rwsem(&fs_info->delayed_iput_sem);
 
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
@@ -2742,26 +2766,6 @@ int open_ctree(struct super_block *sb,
                goto fail_alloc;
        }
 
-       /*
-        * Leafsize and nodesize were always equal, this is only a sanity check.
-        */
-       if (le32_to_cpu(disk_super->__unused_leafsize) !=
-           btrfs_super_nodesize(disk_super)) {
-               printk(KERN_ERR "BTRFS: couldn't mount because metadata "
-                      "blocksizes don't match.  node %d leaf %d\n",
-                      btrfs_super_nodesize(disk_super),
-                      le32_to_cpu(disk_super->__unused_leafsize));
-               err = -EINVAL;
-               goto fail_alloc;
-       }
-       if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
-               printk(KERN_ERR "BTRFS: couldn't mount because metadata "
-                      "blocksize (%d) was too large\n",
-                      btrfs_super_nodesize(disk_super));
-               err = -EINVAL;
-               goto fail_alloc;
-       }
-
        features = btrfs_super_incompat_flags(disk_super);
        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
        if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
@@ -2833,17 +2837,6 @@ int open_ctree(struct super_block *sb,
        sb->s_blocksize = sectorsize;
        sb->s_blocksize_bits = blksize_bits(sectorsize);
 
-       if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-               printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
-               goto fail_sb_buffer;
-       }
-
-       if (sectorsize != PAGE_SIZE) {
-               printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
-                      "found on %s\n", (unsigned long)sectorsize, sb->s_id);
-               goto fail_sb_buffer;
-       }
-
        mutex_lock(&fs_info->chunk_mutex);
        ret = btrfs_read_sys_array(tree_root);
        mutex_unlock(&fs_info->chunk_mutex);
@@ -2915,6 +2908,18 @@ retry_root_backup:
        tree_root->commit_root = btrfs_root_node(tree_root);
        btrfs_set_root_refs(&tree_root->root_item, 1);
 
+       mutex_lock(&tree_root->objectid_mutex);
+       ret = btrfs_find_highest_objectid(tree_root,
+                                       &tree_root->highest_objectid);
+       if (ret) {
+               mutex_unlock(&tree_root->objectid_mutex);
+               goto recovery_tree_root;
+       }
+
+       ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+       mutex_unlock(&tree_root->objectid_mutex);
+
        ret = btrfs_read_roots(fs_info, tree_root);
        if (ret)
                goto recovery_tree_root;
@@ -4018,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                              int read_only)
 {
        struct btrfs_super_block *sb = fs_info->super_copy;
+       u64 nodesize = btrfs_super_nodesize(sb);
+       u64 sectorsize = btrfs_super_sectorsize(sb);
        int ret = 0;
 
+       if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+               printk(KERN_ERR "BTRFS: no valid FS found\n");
+               ret = -EINVAL;
+       }
+       if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
+               printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n",
+                               btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
        if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
                printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
                                btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
@@ -4037,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
        }
 
        /*
-        * The common minimum, we don't know if we can trust the nodesize/sectorsize
-        * items yet, they'll be verified later. Issue just a warning.
+        * Check sectorsize and nodesize first, the other checks will need them.
+        * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
         */
-       if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
+       if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+           sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+               printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize);
+               ret = -EINVAL;
+       }
+       /* Only a sectorsize equal to the page size is supported for now */
+       if (sectorsize != PAGE_CACHE_SIZE) {
+               printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n",
+                               sectorsize, PAGE_CACHE_SIZE);
+               ret = -EINVAL;
+       }
+       if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+           nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+               printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize);
+               ret = -EINVAL;
+       }
+       if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+               printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n",
+                               le32_to_cpu(sb->__unused_leafsize),
+                               nodesize);
+               ret = -EINVAL;
+       }
+
+       /* Root alignment check */
+       if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
                printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
                                btrfs_super_root(sb));
-       if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+               ret = -EINVAL;
+       }
+       if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
                printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
                                btrfs_super_chunk_root(sb));
-       if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
-               printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
-                               btrfs_super_log_root(sb));
-
-       /*
-        * Check the lower bound, the alignment and other constraints are
-        * checked later.
-        */
-       if (btrfs_super_nodesize(sb) < 4096) {
-               printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
-                               btrfs_super_nodesize(sb));
                ret = -EINVAL;
        }
-       if (btrfs_super_sectorsize(sb) < 4096) {
-               printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
-                               btrfs_super_sectorsize(sb));
+       if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+               printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+                               btrfs_super_log_root(sb));
                ret = -EINVAL;
        }
 
index 60cc1399c64f9de26ab9fa6efcaf3e8b0e067829..e2287c7c10bebbbb6dc68f0c4aed056a97bdaafe 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4139,8 +4139,10 @@ commit_trans:
                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
                        need_commit--;
 
-                       if (need_commit > 0)
+                       if (need_commit > 0) {
+                               btrfs_start_delalloc_roots(fs_info, 0, -1);
                                btrfs_wait_ordered_roots(fs_info, -1);
+                       }
 
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
@@ -4153,11 +4155,12 @@ commit_trans:
                                if (ret)
                                        return ret;
                                /*
-                                * make sure that all running delayed iput are
-                                * done
+                                * The cleaner kthread might still be doing iput
+                                * operations. Wait for it to finish so that
+                                * more space is released.
                                 */
-                               down_write(&root->fs_info->delayed_iput_sem);
-                               up_write(&root->fs_info->delayed_iput_sem);
+                               mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
+                               mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
                                goto again;
                        } else {
                                btrfs_end_transaction(trans, root);
@@ -10399,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
         * more device items and remove one chunk item), but this is done at
         * btrfs_remove_chunk() through a call to check_system_chunk().
         */
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        num_items = 3 + map->num_stripes;
        free_extent_map(em);
 
@@ -10586,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 
        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
-               return 1;
+               return -EINVAL;
 
        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
@@ -10816,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
        }
        return 1;
 }
+
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
+{
+       while (true) {
+               int ret;
+
+               ret = btrfs_start_write_no_snapshoting(root);
+               if (ret)
+                       break;
+               wait_on_atomic_t(&root->will_be_snapshoted,
+                                wait_snapshoting_atomic_t,
+                                TASK_UNINTERRUPTIBLE);
+       }
+}
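
btrfs_wait_for_snapshot_creation, now a shared helper rather than a static function in inode.c, is built on the generic wait_on_atomic_t machinery: the waiter sleeps until the atomic_t reaches zero, and the side that decrements it to zero must issue the matching wake-up. A sketch of that waiter/waker contract, with illustrative names:

        #include <linux/atomic.h>
        #include <linux/sched.h>
        #include <linux/wait.h>

        /* The action callback performs the actual sleep; returning 0 means
         * "keep waiting", and the counter is re-checked on each wake-up. */
        static int demo_wait_action(atomic_t *a)
        {
                schedule();
                return 0;
        }

        static void demo_waiter(atomic_t *count)
        {
                /* returns once atomic_read(count) == 0 */
                wait_on_atomic_t(count, demo_wait_action, TASK_UNINTERRUPTIBLE);
        }

        static void demo_waker(atomic_t *count)
        {
                if (atomic_dec_and_test(count))
                        wake_up_atomic_t(count);
        }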
index 6a98bddd8f33a96e82d0fe856763e53b67e9b064..84fb56d5c018dbfea2a4b1970e4275ada5daa0fb 100644 (file)
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em)
                WARN_ON(extent_map_in_tree(em));
                WARN_ON(!list_empty(&em->list));
                if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
-                       kfree(em->bdev);
+                       kfree(em->map_lookup);
                kmem_cache_free(extent_map_cache, em);
        }
 }
index b2991fd8583efe8ed92de32904eb510542bc1f66..eb8b8fae036bc3c67ceea03220cdca503626546f 100644 (file)
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -32,7 +32,15 @@ struct extent_map {
        u64 block_len;
        u64 generation;
        unsigned long flags;
-       struct block_device *bdev;
+       union {
+               struct block_device *bdev;
+
+               /*
+                * used for chunk mappings
+                * flags & EXTENT_FLAG_FS_MAPPING must be set
+                */
+               struct map_lookup *map_lookup;
+       };
        atomic_t refs;
        unsigned int compress_type;
        struct list_head list;
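
This union formalizes what was previously done with a cast: a chunk-mapping extent_map never carries a real block device, so em->bdev had been doubling as an untyped slot for the struct map_lookup pointer. Every cast site in this series collapses to a named member access:

        /* before: type-punned through the bdev pointer */
        map = (struct map_lookup *)em->bdev;

        /* after: explicit member, valid only for chunk mappings */
        if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
                map = em->map_lookup;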
index 83d7859d76199d96ec882c35c87dea98526d229e..098bb8f690c992e1ebd01270f49ff2d37e6658bd 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 /* simple helper to fault in pages and copy.  This should go away
  * and be replaced with calls into generic code.
  */
-static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
-                                        size_t write_bytes,
+static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
                                         struct page **prepared_pages,
                                         struct iov_iter *i)
 {
@@ -1588,8 +1587,7 @@ again:
                        ret = 0;
                }
 
-               copied = btrfs_copy_from_user(pos, num_pages,
-                                          write_bytes, pages, i);
+               copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
 
                /*
                 * if we have trouble faulting in the pages, fall
@@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
        loff_t pos;
        size_t count;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        err = generic_write_checks(iocb, from);
        if (err <= 0) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                return err;
        }
 
        current->backing_dev_info = inode_to_bdi(inode);
        err = file_remove_privs(file);
        if (err) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                goto out;
        }
 
@@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
         * to stop this write operation to ensure FS consistency.
         */
        if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                err = -EROFS;
                goto out;
        }
@@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
                end_pos = round_up(pos + count, root->sectorsize);
                err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
                if (err) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        goto out;
                }
        }
@@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
                        iocb->ki_pos = pos + num_written;
        }
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        /*
         * We also have to set last_sub_trans to the current log transid,
@@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        atomic_inc(&root->log_batch);
        full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                             &BTRFS_I(inode)->runtime_flags);
@@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                ret = start_ordered_ops(inode, start, end);
        }
        if (ret) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                goto out;
        }
        atomic_inc(&root->log_batch);
@@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 */
                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                          &BTRFS_I(inode)->runtime_flags);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                goto out;
        }
 
@@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                goto out;
        }
        trans->sync = true;
@@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        /*
         * If any of the ordered extents had an error, just return it to user
@@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
        ret = find_first_non_hole(inode, &offset, &len);
        if (ret < 0)
@@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                truncated_page = true;
                ret = btrfs_truncate_page(inode, offset, 0, 0);
                if (ret) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        return ret;
                }
        }
@@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                ret = btrfs_wait_ordered_range(inode, lockstart,
                                               lockend - lockstart + 1);
                if (ret) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        return ret;
                }
        }
@@ -2576,7 +2574,7 @@ out_only_mutex:
                        ret = btrfs_end_transaction(trans, root);
                }
        }
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (ret && !err)
                err = ret;
        return err;
@@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode,
        if (ret < 0)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = inode_newsize_ok(inode, alloc_end);
        if (ret)
                goto out;
@@ -2818,7 +2816,7 @@ out:
         * So this is completely used as cleanup.
         */
        btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        /* Let go of our reservation. */
        btrfs_free_reserved_data_space(inode, alloc_start,
                                       alloc_end - alloc_start);
@@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
        struct inode *inode = file->f_mapping->host;
        int ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        switch (whence) {
        case SEEK_END:
        case SEEK_CUR:
@@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
        case SEEK_DATA:
        case SEEK_HOLE:
                if (offset >= i_size_read(inode)) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        return -ENXIO;
                }
 
                ret = find_desired_extent(inode, &offset, whence);
                if (ret) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        return ret;
                }
        }
 
        offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return offset;
 }
 
index 8b57c17b3fb3194bfff6ae1e487fe0fa7a4fa887..e50316c4af157183d3862fac8e318abe7eb66ecd 100644 (file)
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -515,7 +515,7 @@ out:
        return ret;
 }
 
-static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
 {
        struct btrfs_path *path;
        int ret;
@@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
        int ret;
        mutex_lock(&root->objectid_mutex);
 
-       if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
-               ret = btrfs_find_highest_objectid(root,
-                                                 &root->highest_objectid);
-               if (ret)
-                       goto out;
-       }
-
        if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
                ret = -ENOSPC;
                goto out;
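
With the lazy initialization removed here, btrfs_find_free_objectid relies on highest_objectid being populated when the root is set up: btrfs_init_fs_root and open_ctree now do it at load time, and create_subvol (in the ioctl.c hunks below) seeds it for freshly created subvolumes. The allocation fast path then reduces to a locked increment; a fragment mirroring the remaining body:

        mutex_lock(&root->objectid_mutex);
        if (root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID) {
                ret = -ENOSPC;          /* objectid space exhausted */
        } else {
                *objectid = ++root->highest_objectid;
                ret = 0;
        }
        mutex_unlock(&root->objectid_mutex);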
index ddb347bfee232ec26543055fbf408bfb92f8028f..c8e864b2d530f88c7e65350ca557e36e6061700e 100644 (file)
--- a/fs/btrfs/inode-map.h
+++ b/fs/btrfs/inode-map.h
@@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
                         struct btrfs_trans_handle *trans);
 
 int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
 
 #endif
index 24783010768680bea28f4f0e5e9aaa2c6a62a41c..e28f3d4691afee8bf6dcd68d0617a27e07439e82 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3134,7 +3134,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       down_read(&fs_info->delayed_iput_sem);
        spin_lock(&fs_info->delayed_iput_lock);
        while (!list_empty(&fs_info->delayed_iputs)) {
                struct btrfs_inode *inode;
@@ -3153,7 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
                spin_lock(&fs_info->delayed_iput_lock);
        }
        spin_unlock(&fs_info->delayed_iput_lock);
-       up_read(&root->fs_info->delayed_iput_sem);
 }
 
 /*
@@ -4874,26 +4872,6 @@ next:
        return err;
 }
 
-static int wait_snapshoting_atomic_t(atomic_t *a)
-{
-       schedule();
-       return 0;
-}
-
-static void wait_for_snapshot_creation(struct btrfs_root *root)
-{
-       while (true) {
-               int ret;
-
-               ret = btrfs_start_write_no_snapshoting(root);
-               if (ret)
-                       break;
-               wait_on_atomic_t(&root->will_be_snapshoted,
-                                wait_snapshoting_atomic_t,
-                                TASK_UNINTERRUPTIBLE);
-       }
-}
-
 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4925,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                 * truncation, it must capture all writes that happened before
                 * this truncation.
                 */
-               wait_for_snapshot_creation(root);
+               btrfs_wait_for_snapshot_creation(root);
                ret = btrfs_cont_expand(inode, oldsize, newsize);
                if (ret) {
                        btrfs_end_write_no_snapshoting(root);
@@ -8469,7 +8447,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                 * not unlock the i_mutex at this case.
                 */
                if (offset + count <= inode->i_size) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        relock = true;
                }
                ret = btrfs_delalloc_reserve_space(inode, offset, count);
@@ -8526,7 +8504,7 @@ out:
        if (wakeup)
                inode_dio_end(inode);
        if (relock)
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
 
        return ret;
 }
index 2a47a3148ec80df57150e3f5aa7d321abb8d1ccd..952172ca7e455633c28a79292d18ebbfd68c4d18 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        ip_oldflags = ip->flags;
        i_oldflags = inode->i_flags;
@@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        }
 
  out_unlock:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        mnt_drop_write_file(file);
        return ret;
 }
@@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir,
                goto fail;
        }
 
+       mutex_lock(&new_root->objectid_mutex);
+       new_root->highest_objectid = new_dirid;
+       mutex_unlock(&new_root->objectid_mutex);
+
        /*
         * insert the directory item
         */
@@ -877,7 +881,7 @@ out_up_read:
 out_dput:
        dput(dentry);
 out_unlock:
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        return error;
 }
 
@@ -1389,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
                        ra_index += cluster;
                }
 
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
                        BTRFS_I(inode)->force_compress = compress_type;
                ret = cluster_pages_for_defrag(inode, pages, i, cluster);
                if (ret < 0) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        goto out_ra;
                }
 
                defrag_count += ret;
                balance_dirty_pages_ratelimited(inode->i_mapping);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
                if (newer_than) {
                        if (newer_off == (u64)-1)
@@ -1461,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 out_ra:
        if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        if (!file)
                kfree(ra);
@@ -2426,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                goto out_dput;
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * Don't allow to delete a subvolume with send in progress. This is
@@ -2539,7 +2543,7 @@ out_up_write:
                spin_unlock(&dest->root_item_lock);
        }
 out_unlock_inode:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (!err) {
                d_invalidate(dentry);
                btrfs_invalidate_inodes(dest);
@@ -2555,7 +2559,7 @@ out_unlock_inode:
 out_dput:
        dput(dentry);
 out_unlock_dir:
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
 out_drop_write:
        mnt_drop_write_file(file);
 out:
@@ -2853,8 +2857,8 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 
 static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
 {
-       mutex_unlock(&inode1->i_mutex);
-       mutex_unlock(&inode2->i_mutex);
+       inode_unlock(inode1);
+       inode_unlock(inode2);
 }
 
 static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
@@ -2862,8 +2866,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
        if (inode1 < inode2)
                swap(inode1, inode2);
 
-       mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-       mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+       inode_lock_nested(inode1, I_MUTEX_PARENT);
+       inode_lock_nested(inode2, I_MUTEX_CHILD);
 }
 
 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -3022,7 +3026,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
                return 0;
 
        if (same_inode) {
-               mutex_lock(&src->i_mutex);
+               inode_lock(src);
 
                ret = extent_same_check_offsets(src, loff, &len, olen);
                if (ret)
@@ -3097,7 +3101,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
        btrfs_cmp_data_free(&cmp);
 out_unlock:
        if (same_inode)
-               mutex_unlock(&src->i_mutex);
+               inode_unlock(src);
        else
                btrfs_double_inode_unlock(src, dst);
 
@@ -3745,7 +3749,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
        if (!same_inode) {
                btrfs_double_inode_lock(src, inode);
        } else {
-               mutex_lock(&src->i_mutex);
+               inode_lock(src);
        }
 
        /* determine range to clone */
@@ -3816,7 +3820,7 @@ out_unlock:
        if (!same_inode)
                btrfs_double_inode_unlock(src, inode);
        else
-               mutex_unlock(&src->i_mutex);
+               inode_unlock(src);
        return ret;
 }
 
index 6d707545f775119e4883378636b5255a9dcc9144..55161369fab14d7d2381c2c3950a576c895be2e8 100644 (file)
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -609,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
        return 1;
 }
 
+static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
+                                 int index)
+{
+       return stripe * rbio->stripe_npages + index;
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
+                                    int index)
+{
+       return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+}
+
 /*
  * helper to index into the pstripe
  */
 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 {
-       index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
-       return rbio->stripe_pages[index];
+       return rbio_stripe_page(rbio, rbio->nr_data, index);
 }
 
 /*
@@ -626,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 {
        if (rbio->nr_data + 1 == rbio->real_stripes)
                return NULL;
-
-       index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
-               PAGE_CACHE_SHIFT;
-       return rbio->stripe_pages[index];
+       return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
 }
 
 /*
@@ -889,6 +901,7 @@ static void raid_write_end_io(struct bio *bio)
 {
        struct btrfs_raid_bio *rbio = bio->bi_private;
        int err = bio->bi_error;
+       int max_errors;
 
        if (err)
                fail_bio_stripe(rbio, bio);
@@ -901,7 +914,9 @@ static void raid_write_end_io(struct bio *bio)
        err = 0;
 
        /* OK, we have read all the stripes we need to. */
-       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+       max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
+                    0 : rbio->bbio->max_errors;
+       if (atomic_read(&rbio->error) > max_errors)
                err = -EIO;
 
        rbio_orig_end_io(rbio, err);
@@ -947,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
  */
 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 {
-       unsigned long nr = stripe_len * nr_stripes;
-       return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
+       return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes;
 }
 
 /*
@@ -966,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
        void *p;
 
        rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
-                      DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
-                       GFP_NOFS);
+                      DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
+                      sizeof(long), GFP_NOFS);
        if (!rbio)
                return ERR_PTR(-ENOMEM);
 
@@ -1021,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
                if (!page)
                        return -ENOMEM;
                rbio->stripe_pages[i] = page;
-               ClearPageUptodate(page);
        }
        return 0;
 }
 
-/* allocate pages for just the p/q stripes */
+/* only allocate pages for p/q stripes */
 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 {
        int i;
        struct page *page;
 
-       i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+       i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
 
        for (; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i])
@@ -1120,18 +1133,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
        }
 }
 
-/*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
-{
-       int index;
-       index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
-       index += page;
-       return rbio->stripe_pages[index];
-}
-
 /*
  * helper function to walk our bio list and populate the bio_pages array with
  * the result.  This seems expensive, but it is faster than constantly
@@ -1175,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
        struct btrfs_bio *bbio = rbio->bbio;
        void *pointers[rbio->real_stripes];
-       int stripe_len = rbio->stripe_len;
        int nr_data = rbio->nr_data;
        int stripe;
        int pagenr;
@@ -1183,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
        int q_stripe = -1;
        struct bio_list bio_list;
        struct bio *bio;
-       int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
        int ret;
 
        bio_list_init(&bio_list);
@@ -1226,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
        else
                clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
-       for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+       for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
                struct page *p;
                /* first collect one page from each data stripe */
                for (stripe = 0; stripe < nr_data; stripe++) {
@@ -1268,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
         * everything else.
         */
        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
-               for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+               for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
                        struct page *page;
                        if (stripe < rbio->nr_data) {
                                page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1292,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                if (!bbio->tgtdev_map[stripe])
                        continue;
 
-               for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+               for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
                        struct page *page;
                        if (stripe < rbio->nr_data) {
                                page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1506,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
        int bios_to_read = 0;
        struct bio_list bio_list;
        int ret;
-       int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
        int pagenr;
        int stripe;
        struct bio *bio;
@@ -1525,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
         * stripe
         */
        for (stripe = 0; stripe < rbio->nr_data; stripe++) {
-               for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+               for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
                        struct page *page;
                        /*
                         * we want to find all the pages missing from
@@ -1801,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
        int pagenr, stripe;
        void **pointers;
        int faila = -1, failb = -1;
-       int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
        struct page *page;
        int err;
        int i;
@@ -1824,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 
        index_rbio_pages(rbio);
 
-       for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+       for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
                /*
                 * Now we just use bitmap to mark the horizontal stripes in
                 * which we have data when doing parity scrub.
@@ -1935,7 +1932,7 @@ pstripe:
                 * other endio functions will fiddle the uptodate bits
                 */
                if (rbio->operation == BTRFS_RBIO_WRITE) {
-                       for (i = 0;  i < nr_pages; i++) {
+                       for (i = 0;  i < rbio->stripe_npages; i++) {
                                if (faila != -1) {
                                        page = rbio_stripe_page(rbio, faila, i);
                                        SetPageUptodate(page);
@@ -2031,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
        int bios_to_read = 0;
        struct bio_list bio_list;
        int ret;
-       int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
        int pagenr;
        int stripe;
        struct bio *bio;
@@ -2055,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
                        continue;
                }
 
-               for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+               for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
                        struct page *p;
 
                        /*
@@ -2279,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
                        if (!page)
                                return -ENOMEM;
                        rbio->stripe_pages[index] = page;
-                       ClearPageUptodate(page);
                }
        }
        return 0;
 }
 
-/*
- * end io function used by finish_rmw.  When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_parity_end_io(struct bio *bio)
-{
-       struct btrfs_raid_bio *rbio = bio->bi_private;
-       int err = bio->bi_error;
-
-       if (bio->bi_error)
-               fail_bio_stripe(rbio, bio);
-
-       bio_put(bio);
-
-       if (!atomic_dec_and_test(&rbio->stripes_pending))
-               return;
-
-       err = 0;
-
-       if (atomic_read(&rbio->error))
-               err = -EIO;
-
-       rbio_orig_end_io(rbio, err);
-}
-
 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
                                         int need_check)
 {
@@ -2462,7 +2432,7 @@ submit_write:
                        break;
 
                bio->bi_private = rbio;
-               bio->bi_end_io = raid_write_parity_end_io;
+               bio->bi_end_io = raid_write_end_io;
                submit_bio(WRITE, bio);
        }
        return;
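
The raid56.c changes above replace the scattered byte-shift arithmetic with a single page-based index helper, and make the page-count allocation consistent with the per-stripe indexing even if stripe_len were not a whole number of pages. A worked example with hypothetical numbers (6 KiB stripes, 4 KiB pages, 3 stripes; plain C purely for illustration):

        #include <stdio.h>

        #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

        int main(void)
        {
                unsigned long stripe_len = 6 * 1024, page_size = 4 * 1024;
                unsigned long nr_stripes = 3;

                /* old: one rounding over the total -> 5 pages for 3x2 slots */
                unsigned long old_nr = DIV_ROUND_UP(stripe_len * nr_stripes, page_size);
                /* new: whole pages per stripe -> stripe_npages = 2, 6 pages */
                unsigned long stripe_npages = DIV_ROUND_UP(stripe_len, page_size);
                unsigned long new_nr = stripe_npages * nr_stripes;

                /* page i of stripe s now lives at s * stripe_npages + i */
                printf("old=%lu new=%lu index(2,1)=%lu\n",
                       old_nr, new_nr, 2 * stripe_npages + 1);
                return 0;
        }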
index ef6d8fc85853851da0da03923b75421982d65d8f..fd1c4d982463fd92768537f86f8b9ae8545befba 100644 (file)
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3030,7 +3030,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
        int ret = 0;
 
        BUG_ON(cluster->start != cluster->boundary[0]);
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        ret = btrfs_check_data_free_space(inode, cluster->start,
                                          cluster->end + 1 - cluster->start);
@@ -3057,7 +3057,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
        btrfs_free_reserved_data_space(inode, cluster->start,
                                       cluster->end + 1 - cluster->start);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
index 0c981ebe2acb427d5684bca46cc1281e61753905..92bf5ee732fb0e14f7e330dddaadccb4b70b35cc 100644 (file)
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2813,7 +2813,7 @@ out:
 
 static inline int scrub_calc_parity_bitmap_len(int nsectors)
 {
-       return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+       return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
 }
 
 static void scrub_parity_get(struct scrub_parity *sparity)
@@ -3458,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
                return ret;
        }
 
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        if (em->start != chunk_offset)
                goto out;
 
@@ -4279,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
                return PTR_ERR(inode);
 
        /* Avoid truncate/dio/punch hole.. */
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        inode_dio_wait(inode);
 
        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
@@ -4358,7 +4358,7 @@ next_page:
        }
        ret = COPY_COMPLETE;
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        iput(inode);
        return ret;
 }
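
Two bitmap-length expressions change in this series: the scrub_calc_parity_bitmap_len rewrite above is cosmetic (BITS_PER_LONG / 8 equals sizeof(long)), but the matching change in alloc_rbio back in raid56.c also corrects the sizing, since bitmap primitives read and write whole longs and the buffer must therefore be a multiple of sizeof(long), not merely enough bytes to hold the bits. Worked through with BITS_PER_LONG = 64 and 16 stripe pages (plain C purely for illustration):

        #include <stdio.h>

        #define BITS_PER_LONG ((unsigned)(8 * sizeof(long)))
        #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

        int main(void)
        {
                unsigned stripe_npages = 16;

                /* old rbio sizing: enough bytes for the bits, not long-aligned */
                unsigned old_bytes = DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8);
                /* fixed sizing: whole longs, as the bitmap_*() helpers assume */
                unsigned new_bytes = DIV_ROUND_UP(stripe_npages, BITS_PER_LONG)
                                        * (unsigned)sizeof(long);

                printf("old=%u bytes, fixed=%u bytes\n", old_bytes, new_bytes);
                return 0;   /* prints: old=2 bytes, fixed=8 bytes */
        }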
index 9b9eab6d048e93d32963b0c66a6d9ab6c022ee63..d41e09fe8e38d77674862c7fe61f610a81d1159d 100644 (file)
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -383,6 +383,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
        int ret = 0;
        char *compress_type;
        bool compress_force = false;
+       enum btrfs_compression_type saved_compress_type;
+       bool saved_compress_force;
+       int no_compress = 0;
 
        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
        if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
@@ -462,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        /* Fallthrough */
                case Opt_compress:
                case Opt_compress_type:
+                       saved_compress_type = btrfs_test_opt(root, COMPRESS) ?
+                               info->compress_type : BTRFS_COMPRESS_NONE;
+                       saved_compress_force =
+                               btrfs_test_opt(root, FORCE_COMPRESS);
                        if (token == Opt_compress ||
                            token == Opt_compress_force ||
                            strcmp(args[0].from, "zlib") == 0) {
@@ -470,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                                btrfs_set_opt(info->mount_opt, COMPRESS);
                                btrfs_clear_opt(info->mount_opt, NODATACOW);
                                btrfs_clear_opt(info->mount_opt, NODATASUM);
+                               no_compress = 0;
                        } else if (strcmp(args[0].from, "lzo") == 0) {
                                compress_type = "lzo";
                                info->compress_type = BTRFS_COMPRESS_LZO;
@@ -477,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                                btrfs_clear_opt(info->mount_opt, NODATACOW);
                                btrfs_clear_opt(info->mount_opt, NODATASUM);
                                btrfs_set_fs_incompat(info, COMPRESS_LZO);
+                               no_compress = 0;
                        } else if (strncmp(args[0].from, "no", 2) == 0) {
                                compress_type = "no";
                                btrfs_clear_opt(info->mount_opt, COMPRESS);
                                btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
                                compress_force = false;
+                               no_compress++;
                        } else {
                                ret = -EINVAL;
                                goto out;
                        }
 
                        if (compress_force) {
-                               btrfs_set_and_info(root, FORCE_COMPRESS,
-                                                  "force %s compression",
-                                                  compress_type);
+                               btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
                        } else {
-                               if (!btrfs_test_opt(root, COMPRESS))
-                                       btrfs_info(root->fs_info,
-                                                  "btrfs: use %s compression",
-                                                  compress_type);
                                /*
                                 * If we remount from compress-force=xxx to
                                 * compress=xxx, we need clear FORCE_COMPRESS
@@ -504,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                                 */
                                btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
                        }
+                       if ((btrfs_test_opt(root, COMPRESS) &&
+                            (info->compress_type != saved_compress_type ||
+                             compress_force != saved_compress_force)) ||
+                           (!btrfs_test_opt(root, COMPRESS) &&
+                            no_compress == 1)) {
+                               btrfs_info(root->fs_info,
+                                          "%s %s compression",
+                                          (compress_force) ? "force" : "use",
+                                          compress_type);
+                       }
+                       compress_force = false;
                        break;
                case Opt_ssd:
                        btrfs_set_and_info(root, SSD,
index c32abbca9d77f65684b85d137fa41bec8bac005c..366b335946fae763e380ed884792df5efe07a2c9 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        },
 };
 
-const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
        [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
        [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
@@ -233,6 +233,7 @@ static struct btrfs_device *__alloc_device(void)
        spin_lock_init(&dev->reada_lock);
        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
+       btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 
@@ -1183,7 +1184,7 @@ again:
                struct map_lookup *map;
                int i;
 
-               map = (struct map_lookup *)em->bdev;
+               map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;
 
@@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
                        free_extent_map(em);
                return -EINVAL;
        }
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        lock_chunks(root->fs_info->chunk_root);
        check_system_chunk(trans, extent_root, map->type);
        unlock_chunks(root->fs_info->chunk_root);
@@ -3751,7 +3752,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
                btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
                btrfs_warn(fs_info,
-       "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx",
+       "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
                        bctl->meta.target, bctl->data.target);
        }
 
@@ -4718,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                goto error;
        }
        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
-       em->bdev = (struct block_device *)map;
+       em->map_lookup = map;
        em->start = start;
        em->len = num_bytes;
        em->block_start = 0;
@@ -4813,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                return -EINVAL;
        }
 
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        item_size = btrfs_chunk_item_size(map->num_stripes);
        stripe_size = em->orig_block_len;
 
@@ -4968,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
        if (!em)
                return 1;
 
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        for (i = 0; i < map->num_stripes; i++) {
                if (map->stripes[i].dev->missing) {
                        miss_ndevs++;
@@ -5048,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
                return 1;
        }
 
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
                ret = map->num_stripes;
        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
@@ -5084,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
        BUG_ON(!em);
 
        BUG_ON(em->start > logical || em->start + em->len < logical);
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                len = map->stripe_len * nr_data_stripes(map);
        free_extent_map(em);
@@ -5105,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
        BUG_ON(!em);
 
        BUG_ON(em->start > logical || em->start + em->len < logical);
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                ret = 1;
        free_extent_map(em);
@@ -5264,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                return -EINVAL;
        }
 
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
        offset = logical - em->start;
 
        stripe_len = map->stripe_len;
@@ -5378,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                 * target drive.
                 */
                for (i = 0; i < tmp_num_stripes; i++) {
-                       if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
-                               /*
-                                * In case of DUP, in order to keep it
-                                * simple, only add the mirror with the
-                                * lowest physical address
-                                */
-                               if (found &&
-                                   physical_of_found <=
-                                    tmp_bbio->stripes[i].physical)
-                                       continue;
-                               index_srcdev = i;
-                               found = 1;
-                               physical_of_found =
-                                       tmp_bbio->stripes[i].physical;
-                       }
+                       if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
+                               continue;
+
+                       /*
+                        * In case of DUP, in order to keep it simple, only add
+                        * the mirror with the lowest physical address
+                        */
+                       if (found &&
+                           physical_of_found <= tmp_bbio->stripes[i].physical)
+                               continue;
+
+                       index_srcdev = i;
+                       found = 1;
+                       physical_of_found = tmp_bbio->stripes[i].physical;
                }
 
-               if (found) {
-                       mirror_num = index_srcdev + 1;
-                       patch_the_first_stripe_for_dev_replace = 1;
-                       physical_to_patch_in_first_stripe = physical_of_found;
-               } else {
+               btrfs_put_bbio(tmp_bbio);
+
+               if (!found) {
                        WARN_ON(1);
                        ret = -EIO;
-                       btrfs_put_bbio(tmp_bbio);
                        goto out;
                }
 
-               btrfs_put_bbio(tmp_bbio);
+               mirror_num = index_srcdev + 1;
+               patch_the_first_stripe_for_dev_replace = 1;
+               physical_to_patch_in_first_stripe = physical_of_found;
        } else if (mirror_num > map->num_stripes) {
                mirror_num = 0;
        }
@@ -5806,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                free_extent_map(em);
                return -EIO;
        }
-       map = (struct map_lookup *)em->bdev;
+       map = em->map_lookup;
 
        length = em->len;
        rmap_len = map->stripe_len;
@@ -6069,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        bbio->fs_info = root->fs_info;
        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
-       if (bbio->raid_map) {
+       if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+           ((rw & WRITE) || (mirror_num > 1))) {
                /* In this case, map_length has been set to the length of
                   a single stripe; not the whole write */
                if (rw & WRITE) {
@@ -6210,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
        struct extent_map *em;
        u64 logical;
        u64 length;
+       u64 stripe_len;
        u64 devid;
        u8 uuid[BTRFS_UUID_SIZE];
        int num_stripes;
@@ -6218,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
        logical = key->offset;
        length = btrfs_chunk_length(leaf, chunk);
+       stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+       num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       /* Validation check */
+       if (!num_stripes) {
+               btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+                         num_stripes);
+               return -EIO;
+       }
+       if (!IS_ALIGNED(logical, root->sectorsize)) {
+               btrfs_err(root->fs_info,
+                         "invalid chunk logical %llu", logical);
+               return -EIO;
+       }
+       if (!length || !IS_ALIGNED(length, root->sectorsize)) {
+               btrfs_err(root->fs_info,
+                       "invalid chunk length %llu", length);
+               return -EIO;
+       }
+       if (!is_power_of_2(stripe_len)) {
+               btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+                         stripe_len);
+               return -EIO;
+       }
+       if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+           btrfs_chunk_type(leaf, chunk)) {
+               btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+                         ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+                           BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+                         btrfs_chunk_type(leaf, chunk));
+               return -EIO;
+       }
 
        read_lock(&map_tree->map_tree.lock);
        em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6234,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
        em = alloc_extent_map();
        if (!em)
                return -ENOMEM;
-       num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
        if (!map) {
                free_extent_map(em);
@@ -6242,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
        }
 
        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
-       em->bdev = (struct block_device *)map;
+       em->map_lookup = map;
        em->start = logical;
        em->len = length;
        em->orig_start = 0;
@@ -6944,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
        /* In order to kick the device replace finish process */
        lock_chunks(root);
        list_for_each_entry(em, &transaction->pending_chunks, list) {
-               map = (struct map_lookup *)em->bdev;
+               map = em->map_lookup;
 
                for (i = 0; i < map->num_stripes; i++) {
                        dev = map->stripes[i].dev;
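Two recurring changes run through the fs/btrfs/volumes.c hunks above: the long-standing abuse of em->bdev as a disguised (struct map_lookup *) is replaced by a dedicated em->map_lookup member, removing the cast at every lookup site, and read_one_chunk() now rejects malformed chunk items before building an extent map from them. A minimal sketch consolidating those new checks into one hypothetical helper (the patch itself inlines them, with per-check btrfs_err messages):

static int chunk_item_sane(struct btrfs_root *root, u64 logical,
			   u64 length, u64 stripe_len, int num_stripes)
{
	return num_stripes != 0 &&			/* at least one stripe */
	       IS_ALIGNED(logical, root->sectorsize) &&	/* aligned start */
	       length != 0 &&
	       IS_ALIGNED(length, root->sectorsize) &&	/* aligned length */
	       is_power_of_2(stripe_len);		/* sane stripe size */
}
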
index fd953c361a43c7c7f0faf3e100df12b052c06552..6c68d6356197afb630443f383100894227b07e65 100644 (file)
@@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
         * locks the inode's i_mutex before calling setxattr or removexattr.
         */
        if (flags & XATTR_REPLACE) {
-               ASSERT(mutex_is_locked(&inode->i_mutex));
+               ASSERT(inode_is_locked(inode));
                di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
                                        name, name_len, 0);
                if (!di)
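This one-liner is part of the tree-wide conversion from open-coded i_mutex operations to the new inode_lock()/inode_unlock() wrappers, visible throughout the cachefiles, ceph, and cifs hunks below. As merged for this cycle the wrappers are thin aliases over i_mutex (reproduced from memory; treat as a sketch rather than the authoritative fs.h text):

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}

static inline int inode_is_locked(struct inode *inode)
{
	return mutex_is_locked(&inode->i_mutex);
}

Keeping call sites behind these helpers lets the underlying lock change later without another tree-wide sweep.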
index afa023dded5be937ddd713847b37f2d165f586be..675a3332d72fad1eed4cba643959dbb9dabde051 100644 (file)
@@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
                return 0;
 
        cachefiles_begin_secure(cache, &saved_cred);
-       mutex_lock(&d_inode(object->backer)->i_mutex);
+       inode_lock(d_inode(object->backer));
 
        /* if there's an extension to a partial page at the end of the backing
         * file, we need to discard the partial page so that we pick up new
@@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
        ret = notify_change(object->backer, &newattrs, NULL);
 
 truncate_failed:
-       mutex_unlock(&d_inode(object->backer)->i_mutex);
+       inode_unlock(d_inode(object->backer));
        cachefiles_end_secure(cache, saved_cred);
 
        if (ret == -EIO) {
index c4b893453e0eefda8475b3537e934f2bfc46d268..1c2334c163ddea17f99298b89ec58e09c51d3bd0 100644 (file)
@@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
                                cachefiles_mark_object_buried(cache, rep, why);
                }
 
-               mutex_unlock(&d_inode(dir)->i_mutex);
+               inode_unlock(d_inode(dir));
 
                if (ret == -EIO)
                        cachefiles_io_error(cache, "Unlink failed");
@@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 
        /* directories have to be moved to the graveyard */
        _debug("move stale object to graveyard");
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 
 try_again:
        /* first step is to make up a grave dentry in the graveyard */
@@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 
        dir = dget_parent(object->dentry);
 
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
        if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
                /* object allocation for the same key preemptively deleted this
                 * object's file so that it could create its own file */
                _debug("object preemptively buried");
-               mutex_unlock(&d_inode(dir)->i_mutex);
+               inode_unlock(d_inode(dir));
                ret = 0;
        } else {
                /* we need to check that our parent is _still_ our parent - it
@@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
                        /* it got moved, presumably by cachefilesd culling it,
                         * so it's no longer in the key path and we can ignore
                         * it */
-                       mutex_unlock(&d_inode(dir)->i_mutex);
+                       inode_unlock(d_inode(dir));
                        ret = 0;
                }
        }
@@ -501,7 +501,7 @@ lookup_again:
        /* search the current directory for the element name */
        _debug("lookup '%s'", name);
 
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
        start = jiffies;
        next = lookup_one_len(name, dir, nlen);
@@ -585,7 +585,7 @@ lookup_again:
        /* process the next component */
        if (key) {
                _debug("advance");
-               mutex_unlock(&d_inode(dir)->i_mutex);
+               inode_unlock(d_inode(dir));
                dput(dir);
                dir = next;
                next = NULL;
@@ -623,7 +623,7 @@ lookup_again:
        /* note that we're now using this object */
        ret = cachefiles_mark_object_active(cache, object);
 
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        dput(dir);
        dir = NULL;
 
@@ -705,7 +705,7 @@ lookup_error:
                cachefiles_io_error(cache, "Lookup failed");
        next = NULL;
 error:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        dput(next);
 error_out2:
        dput(dir);
@@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
        _enter(",,%s", dirname);
 
        /* search the current directory for the element name */
-       mutex_lock(&d_inode(dir)->i_mutex);
+       inode_lock(d_inode(dir));
 
        start = jiffies;
        subdir = lookup_one_len(dirname, dir, strlen(dirname));
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
                       d_backing_inode(subdir)->i_ino);
        }
 
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 
        /* we need to make sure the subdir is a directory */
        ASSERT(d_backing_inode(subdir));
@@ -800,19 +800,19 @@ check_error:
        return ERR_PTR(ret);
 
 mkdir_error:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        dput(subdir);
        pr_err("mkdir %s failed with error %d\n", dirname, ret);
        return ERR_PTR(ret);
 
 lookup_error:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        ret = PTR_ERR(subdir);
        pr_err("Lookup %s failed with error %d\n", dirname, ret);
        return ERR_PTR(ret);
 
 nomem_d_alloc:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        _leave(" = -ENOMEM");
        return ERR_PTR(-ENOMEM);
 }
@@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
        //       dir, filename);
 
        /* look up the victim */
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
        start = jiffies;
        victim = lookup_one_len(filename, dir, strlen(filename));
@@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
         * at the netfs's request whilst the cull was in progress
         */
        if (d_is_negative(victim)) {
-               mutex_unlock(&d_inode(dir)->i_mutex);
+               inode_unlock(d_inode(dir));
                dput(victim);
                _leave(" = -ENOENT [absent]");
                return ERR_PTR(-ENOENT);
@@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
 
 object_in_use:
        read_unlock(&cache->active_lock);
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        dput(victim);
        //_leave(" = -EBUSY [in use]");
        return ERR_PTR(-EBUSY);
 
 lookup_error:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        ret = PTR_ERR(victim);
        if (ret == -ENOENT) {
                /* file or dir now absent - probably retired by netfs */
@@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
        return 0;
 
 error_unlock:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 error:
        dput(victim);
        if (ret == -ENOENT) {
@@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
        if (IS_ERR(victim))
                return PTR_ERR(victim);
 
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        dput(victim);
        //_leave(" = 0");
        return 0;
index b7d218a168fb81c2c028ea38ffccf2a17a1d68d5..c22213789090f975b3680b10e022a1e5f2f9fdb4 100644 (file)
@@ -1108,7 +1108,7 @@ retry_locked:
                return 0;
 
        /* past end of file? */
-       i_size = inode->i_size;   /* caller holds i_mutex */
+       i_size = i_size_read(inode);
 
        if (page_off >= i_size ||
            (pos_in_page == 0 && (pos+len) >= i_size &&
@@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
                page = grab_cache_page_write_begin(mapping, index, 0);
                if (!page)
                        return -ENOMEM;
-               *pagep = page;
 
                dout("write_begin file %p inode %p page %p %d~%d\n", file,
                     inode, page, (int)pos, (int)len);
@@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
                zero_user_segment(page, from+copied, len);
 
        /* did file size increase? */
-       /* (no need for i_size_read(); we caller holds i_mutex */
-       if (pos+copied > inode->i_size)
+       if (pos+copied > i_size_read(inode))
                check_cap = ceph_inode_set_size(inode, pos+copied);
 
        if (!PageUptodate(page))
@@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
        ret = VM_FAULT_NOPAGE;
        if ((off > size) ||
-           (page->mapping != inode->i_mapping))
+           (page->mapping != inode->i_mapping)) {
+               unlock_page(page);
                goto out;
+       }
 
        ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-       if (ret == 0) {
+       if (ret >= 0) {
                /* success.  we'll keep the page locked. */
                set_page_dirty(page);
                ret = VM_FAULT_LOCKED;
@@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                        ret = VM_FAULT_SIGBUS;
        }
 out:
-       if (ret != VM_FAULT_LOCKED)
-               unlock_page(page);
        if (ret == VM_FAULT_LOCKED ||
            ci->i_inline_version != CEPH_INLINE_NONE) {
                int dirty;
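The ceph_page_mkwrite() change above fixes the locking protocol: the early-out path now unlocks the page itself, and the blanket "unlock unless VM_FAULT_LOCKED" at out: is dropped, so a successful return keeps the page locked as the fault path expects. A schematic of the contract being restored (hypothetical minimal handler, not the ceph code):

static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	if (page->mapping != file_inode(vma->vm_file)->i_mapping) {
		unlock_page(page);	/* caller will retry the fault */
		return VM_FAULT_NOPAGE;
	}
	set_page_dirty(page);
	return VM_FAULT_LOCKED;		/* page handed back locked */
}
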
index a4766ded1ba78e8bbbff70d4c5e65b3f9060d687..a351480dbabc95891e4b61f83fb92485f8ea7b18 100644 (file)
@@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
 
        memset(&aux, 0, sizeof(aux));
        aux.mtime = inode->i_mtime;
-       aux.size = inode->i_size;
+       aux.size = i_size_read(inode);
 
        memcpy(buffer, &aux, sizeof(aux));
 
@@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
                                        uint64_t *size)
 {
        const struct ceph_inode_info* ci = cookie_netfs_data;
-       const struct inode* inode = &ci->vfs_inode;
-
-       *size = inode->i_size;
+       *size = i_size_read(&ci->vfs_inode);
 }
 
 static enum fscache_checkaux ceph_fscache_inode_check_aux(
@@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
 
        memset(&aux, 0, sizeof(aux));
        aux.mtime = inode->i_mtime;
-       aux.size = inode->i_size;
+       aux.size = i_size_read(inode);
 
        if (memcmp(data, &aux, sizeof(aux)) != 0)
                return FSCACHE_CHECKAUX_OBSOLETE;
@@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
                return;
 
        /* Avoid multiple racing open requests */
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if (ci->fscache)
                goto done;
@@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
                                             ci, true);
        fscache_check_consistency(ci->fscache);
 done:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
 }
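These hunks and the ceph ones that follow replace direct inode->i_size reads with i_size_read(), which is only harmless shorthand on 64-bit; on 32-bit SMP it retries under a seqcount so a concurrent i_size_write() cannot be observed torn. Approximately (a from-memory sketch of the generic helper, omitting the CONFIG_PREEMPT branch):

static inline loff_t i_size_read_sketch(const struct inode *inode)
{
#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
	loff_t i_size;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&inode->i_size_seqcount);
		i_size = inode->i_size;
	} while (read_seqcount_retry(&inode->i_size_seqcount, seq));
	return i_size;
#else
	return inode->i_size;	/* 64-bit loads are atomic */
#endif
}
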
 
index c69e1253b47bfbefb2eb2cb1564a5c4b9d87f9a3..cdbf8cf3d52c3354569b7c86a709f4ce3fc09e2c 100644 (file)
@@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        if (datasync)
                goto out;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        dirty = try_flush_caps(inode, &flush_tid);
        dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
@@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                ret = wait_event_interruptible(ci->i_cap_wq,
                                        caps_are_flushed(inode, flush_tid));
        }
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 out:
        dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
        return ret;
index 9314b4ea2375145647aa16446a1f33ae2c8106b2..fd11fb231a2ea796e86eae933265a488ad6774e5 100644 (file)
@@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
        loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
        loff_t retval;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        retval = -EINVAL;
        switch (whence) {
        case SEEK_CUR:
@@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
                }
        }
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return retval;
 }
 
index fe02ae7f056a332220929ce653e822a0a39e2605..3b31723573268e924346c1bffddb7361390b2a8d 100644 (file)
@@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
        if (IS_ERR(req))
                return PTR_ERR(req);
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
 
        req->r_inode = d_inode(child);
        ihold(d_inode(child));
@@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
        req->r_num_caps = 2;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
 
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
 
        if (!err) {
                struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
index 3c68e6aee2f0531bdd8afb0125bbae48419d0dfb..86a9c383955e56037eb38419b1e4617317d32237 100644 (file)
@@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file)
 }
 
 enum {
-       CHECK_EOF = 1,
-       READ_INLINE = 2,
+       HAVE_RETRIED = 1,
+       CHECK_EOF =    2,
+       READ_INLINE =  3,
 };
 
 /*
@@ -411,17 +412,15 @@ enum {
 static int striped_read(struct inode *inode,
                        u64 off, u64 len,
                        struct page **pages, int num_pages,
-                       int *checkeof, bool o_direct,
-                       unsigned long buf_align)
+                       int *checkeof)
 {
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
        struct ceph_inode_info *ci = ceph_inode(inode);
        u64 pos, this_len, left;
-       int io_align, page_align;
-       int pages_left;
-       int read;
+       loff_t i_size;
+       int page_align, pages_left;
+       int read, ret;
        struct page **page_pos;
-       int ret;
        bool hit_stripe, was_short;
 
        /*
@@ -432,13 +431,9 @@ static int striped_read(struct inode *inode,
        page_pos = pages;
        pages_left = num_pages;
        read = 0;
-       io_align = off & ~PAGE_MASK;
 
 more:
-       if (o_direct)
-               page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
-       else
-               page_align = pos & ~PAGE_MASK;
+       page_align = pos & ~PAGE_MASK;
        this_len = left;
        ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                                  &ci->i_layout, pos, &this_len,
@@ -452,13 +447,12 @@ more:
        dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
             ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
 
+       i_size = i_size_read(inode);
        if (ret >= 0) {
                int didpages;
-               if (was_short && (pos + ret < inode->i_size)) {
-                       int zlen = min(this_len - ret,
-                                      inode->i_size - pos - ret);
-                       int zoff = (o_direct ? buf_align : io_align) +
-                                   read + ret;
+               if (was_short && (pos + ret < i_size)) {
+                       int zlen = min(this_len - ret, i_size - pos - ret);
+                       int zoff = (off & ~PAGE_MASK) + read + ret;
                        dout(" zero gap %llu to %llu\n",
                                pos + ret, pos + ret + zlen);
                        ceph_zero_page_vector_range(zoff, zlen, pages);
@@ -473,14 +467,14 @@ more:
                pages_left -= didpages;
 
                /* hit stripe and need continue*/
-               if (left && hit_stripe && pos < inode->i_size)
+               if (left && hit_stripe && pos < i_size)
                        goto more;
        }
 
        if (read > 0) {
                ret = read;
                /* did we bounce off eof? */
-               if (pos + left > inode->i_size)
+               if (pos + left > i_size)
                        *checkeof = CHECK_EOF;
        }
 
@@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
        if (ret < 0)
                return ret;
 
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               while (iov_iter_count(i)) {
-                       size_t start;
-                       ssize_t n;
-
-                       n = dio_get_pagev_size(i);
-                       pages = dio_get_pages_alloc(i, n, &start, &num_pages);
-                       if (IS_ERR(pages))
-                               return PTR_ERR(pages);
-
-                       ret = striped_read(inode, off, n,
-                                          pages, num_pages, checkeof,
-                                          1, start);
-
-                       ceph_put_page_vector(pages, num_pages, true);
-
-                       if (ret <= 0)
-                               break;
-                       off += ret;
-                       iov_iter_advance(i, ret);
-                       if (ret < n)
+       num_pages = calc_pages_for(off, len);
+       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+       if (IS_ERR(pages))
+               return PTR_ERR(pages);
+       ret = striped_read(inode, off, len, pages,
+                               num_pages, checkeof);
+       if (ret > 0) {
+               int l, k = 0;
+               size_t left = ret;
+
+               while (left) {
+                       size_t page_off = off & ~PAGE_MASK;
+                       size_t copy = min_t(size_t, left,
+                                           PAGE_SIZE - page_off);
+                       l = copy_page_to_iter(pages[k++], page_off, copy, i);
+                       off += l;
+                       left -= l;
+                       if (l < copy)
                                break;
                }
-       } else {
-               num_pages = calc_pages_for(off, len);
-               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-               if (IS_ERR(pages))
-                       return PTR_ERR(pages);
-               ret = striped_read(inode, off, len, pages,
-                                       num_pages, checkeof, 0, 0);
-               if (ret > 0) {
-                       int l, k = 0;
-                       size_t left = ret;
-
-                       while (left) {
-                               size_t page_off = off & ~PAGE_MASK;
-                               size_t copy = min_t(size_t,
-                                                   PAGE_SIZE - page_off, left);
-                               l = copy_page_to_iter(pages[k++], page_off,
-                                                     copy, i);
-                               off += l;
-                               left -= l;
-                               if (l < copy)
-                                       break;
-                       }
-               }
-               ceph_release_page_vector(pages, num_pages);
        }
+       ceph_release_page_vector(pages, num_pages);
 
        if (off > iocb->ki_pos) {
                ret = off - iocb->ki_pos;
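With the O_DIRECT branch gone (it moves to the new ceph_direct_read_write() below), ceph_sync_read() always bounces through a freshly allocated page vector and copies out with copy_page_to_iter(). The vector is sized with calc_pages_for(), which counts the pages spanned by an arbitrary (offset, length) range; roughly (a sketch, assuming the usual libceph definition):

static inline int calc_pages_for_sketch(u64 off, u64 len)
{
	/* index one past the last page, minus index of the first page */
	return ((off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
	       (off >> PAGE_CACHE_SHIFT);
}
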
@@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
        return ret;
 }
 
+struct ceph_aio_request {
+       struct kiocb *iocb;
+       size_t total_len;
+       int write;
+       int error;
+       struct list_head osd_reqs;
+       unsigned num_reqs;
+       atomic_t pending_reqs;
+       struct timespec mtime;
+       struct ceph_cap_flush *prealloc_cf;
+};
+
+struct ceph_aio_work {
+       struct work_struct work;
+       struct ceph_osd_request *req;
+};
+
+static void ceph_aio_retry_work(struct work_struct *work);
+
+static void ceph_aio_complete(struct inode *inode,
+                             struct ceph_aio_request *aio_req)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       int ret;
+
+       if (!atomic_dec_and_test(&aio_req->pending_reqs))
+               return;
+
+       ret = aio_req->error;
+       if (!ret)
+               ret = aio_req->total_len;
+
+       dout("ceph_aio_complete %p rc %d\n", inode, ret);
+
+       if (ret >= 0 && aio_req->write) {
+               int dirty;
+
+               loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
+               if (endoff > i_size_read(inode)) {
+                       if (ceph_inode_set_size(inode, endoff))
+                               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+               }
+
+               spin_lock(&ci->i_ceph_lock);
+               ci->i_inline_version = CEPH_INLINE_NONE;
+               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+                                              &aio_req->prealloc_cf);
+               spin_unlock(&ci->i_ceph_lock);
+               if (dirty)
+                       __mark_inode_dirty(inode, dirty);
+
+       }
+
+       ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
+                                               CEPH_CAP_FILE_RD));
+
+       aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+
+       ceph_free_cap_flush(aio_req->prealloc_cf);
+       kfree(aio_req);
+}
+
+static void ceph_aio_complete_req(struct ceph_osd_request *req,
+                                 struct ceph_msg *msg)
+{
+       int rc = req->r_result;
+       struct inode *inode = req->r_inode;
+       struct ceph_aio_request *aio_req = req->r_priv;
+       struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+       int num_pages = calc_pages_for((u64)osd_data->alignment,
+                                      osd_data->length);
+
+       dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
+            inode, rc, osd_data->length);
+
+       if (rc == -EOLDSNAPC) {
+               struct ceph_aio_work *aio_work;
+               BUG_ON(!aio_req->write);
+
+               aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
+               if (aio_work) {
+                       INIT_WORK(&aio_work->work, ceph_aio_retry_work);
+                       aio_work->req = req;
+                       queue_work(ceph_inode_to_client(inode)->wb_wq,
+                                  &aio_work->work);
+                       return;
+               }
+               rc = -ENOMEM;
+       } else if (!aio_req->write) {
+               if (rc == -ENOENT)
+                       rc = 0;
+               if (rc >= 0 && osd_data->length > rc) {
+                       int zoff = osd_data->alignment + rc;
+                       int zlen = osd_data->length - rc;
+                       /*
+                        * If read is satisfied by single OSD request,
+                        * it can pass EOF. Otherwise read is within
+                        * i_size.
+                        */
+                       if (aio_req->num_reqs == 1) {
+                               loff_t i_size = i_size_read(inode);
+                               loff_t endoff = aio_req->iocb->ki_pos + rc;
+                               if (endoff < i_size)
+                                       zlen = min_t(size_t, zlen,
+                                                    i_size - endoff);
+                               aio_req->total_len = rc + zlen;
+                       }
+
+                       if (zlen > 0)
+                               ceph_zero_page_vector_range(zoff, zlen,
+                                                           osd_data->pages);
+               }
+       }
+
+       ceph_put_page_vector(osd_data->pages, num_pages, false);
+       ceph_osdc_put_request(req);
+
+       if (rc < 0)
+               cmpxchg(&aio_req->error, 0, rc);
+
+       ceph_aio_complete(inode, aio_req);
+       return;
+}
+
+static void ceph_aio_retry_work(struct work_struct *work)
+{
+       struct ceph_aio_work *aio_work =
+               container_of(work, struct ceph_aio_work, work);
+       struct ceph_osd_request *orig_req = aio_work->req;
+       struct ceph_aio_request *aio_req = orig_req->r_priv;
+       struct inode *inode = orig_req->r_inode;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_snap_context *snapc;
+       struct ceph_osd_request *req;
+       int ret;
+
+       spin_lock(&ci->i_ceph_lock);
+       if (__ceph_have_pending_cap_snap(ci)) {
+               struct ceph_cap_snap *capsnap =
+                       list_last_entry(&ci->i_cap_snaps,
+                                       struct ceph_cap_snap,
+                                       ci_item);
+               snapc = ceph_get_snap_context(capsnap->context);
+       } else {
+               BUG_ON(!ci->i_head_snapc);
+               snapc = ceph_get_snap_context(ci->i_head_snapc);
+       }
+       spin_unlock(&ci->i_ceph_lock);
+
+       req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+                       false, GFP_NOFS);
+       if (IS_ERR(req)) {
+               ret = PTR_ERR(req);
+               req = orig_req;
+               goto out;
+       }
+
+       req->r_flags =  CEPH_OSD_FLAG_ORDERSNAP |
+                       CEPH_OSD_FLAG_ONDISK |
+                       CEPH_OSD_FLAG_WRITE;
+       req->r_base_oloc = orig_req->r_base_oloc;
+       req->r_base_oid = orig_req->r_base_oid;
+
+       req->r_ops[0] = orig_req->r_ops[0];
+       osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+
+       ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
+                               snapc, CEPH_NOSNAP, &aio_req->mtime);
+
+       ceph_put_snap_context(snapc);
+       ceph_osdc_put_request(orig_req);
+
+       req->r_callback = ceph_aio_complete_req;
+       req->r_inode = inode;
+       req->r_priv = aio_req;
+
+       ret = ceph_osdc_start_request(req->r_osdc, req, false);
+out:
+       if (ret < 0) {
+               BUG_ON(ret == -EOLDSNAPC);
+               req->r_result = ret;
+               ceph_aio_complete_req(req, NULL);
+       }
+
+       kfree(aio_work);
+}
+
 /*
  * Write commit request unsafe callback, called to tell us when a
  * request is unsafe (that is, in flight--has been handed to the
@@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
 }
 
 
-/*
- * Synchronous write, straight from __user pointer or user pages.
- *
- * If write spans object boundary, just do multiple writes.  (For a
- * correct atomic write, we should e.g. take write locks on all
- * objects, rollback on failure, etc.)
- */
 static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
-                      struct ceph_snap_context *snapc)
+ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
+                      struct ceph_snap_context *snapc,
+                      struct ceph_cap_flush **pcf)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
@@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
        struct ceph_vino vino;
        struct ceph_osd_request *req;
        struct page **pages;
-       int num_pages;
-       int written = 0;
+       struct ceph_aio_request *aio_req = NULL;
+       int num_pages = 0;
        int flags;
-       int check_caps = 0;
        int ret;
        struct timespec mtime = CURRENT_TIME;
-       size_t count = iov_iter_count(from);
+       size_t count = iov_iter_count(iter);
+       loff_t pos = iocb->ki_pos;
+       bool write = iov_iter_rw(iter) == WRITE;
 
-       if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+       if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
                return -EROFS;
 
-       dout("sync_direct_write on file %p %lld~%u\n", file, pos,
-            (unsigned)count);
+       dout("sync_direct_read_write (%s) on file %p %lld~%u\n",
+            (write ? "write" : "read"), file, pos, (unsigned)count);
 
        ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
        if (ret < 0)
                return ret;
 
-       ret = invalidate_inode_pages2_range(inode->i_mapping,
-                                           pos >> PAGE_CACHE_SHIFT,
-                                           (pos + count) >> PAGE_CACHE_SHIFT);
-       if (ret < 0)
-               dout("invalidate_inode_pages2_range returned %d\n", ret);
+       if (write) {
+               ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                       pos >> PAGE_CACHE_SHIFT,
+                                       (pos + count) >> PAGE_CACHE_SHIFT);
+               if (ret < 0)
+                       dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-       flags = CEPH_OSD_FLAG_ORDERSNAP |
-               CEPH_OSD_FLAG_ONDISK |
-               CEPH_OSD_FLAG_WRITE;
+               flags = CEPH_OSD_FLAG_ORDERSNAP |
+                       CEPH_OSD_FLAG_ONDISK |
+                       CEPH_OSD_FLAG_WRITE;
+       } else {
+               flags = CEPH_OSD_FLAG_READ;
+       }
 
-       while (iov_iter_count(from) > 0) {
-               u64 len = dio_get_pagev_size(from);
-               size_t start;
-               ssize_t n;
+       while (iov_iter_count(iter) > 0) {
+               u64 size = dio_get_pagev_size(iter);
+               size_t start = 0;
+               ssize_t len;
 
                vino = ceph_vino(inode);
                req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-                                           vino, pos, &len, 0,
-                                           2,/*include a 'startsync' command*/
-                                           CEPH_OSD_OP_WRITE, flags, snapc,
+                                           vino, pos, &size, 0,
+                                           /*include a 'startsync' command*/
+                                           write ? 2 : 1,
+                                           write ? CEPH_OSD_OP_WRITE :
+                                                   CEPH_OSD_OP_READ,
+                                           flags, snapc,
                                            ci->i_truncate_seq,
                                            ci->i_truncate_size,
                                            false);
@@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                        break;
                }
 
-               osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
-
-               n = len;
-               pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+               len = size;
+               pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
                if (IS_ERR(pages)) {
                        ceph_osdc_put_request(req);
                        ret = PTR_ERR(pages);
@@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
                }
 
                /*
-                * throw out any page cache pages in this range. this
-                * may block.
+                * To simplify error handling, allow AIO when IO within i_size
+                * or IO can be satisfied by single OSD request.
                 */
-               truncate_inode_pages_range(inode->i_mapping, pos,
-                                  (pos+n) | (PAGE_CACHE_SIZE-1));
-               osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
-                                               false, false);
+               if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
+                   (len == count || pos + count <= i_size_read(inode))) {
+                       aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
+                       if (aio_req) {
+                               aio_req->iocb = iocb;
+                               aio_req->write = write;
+                               INIT_LIST_HEAD(&aio_req->osd_reqs);
+                               if (write) {
+                                       aio_req->mtime = mtime;
+                                       swap(aio_req->prealloc_cf, *pcf);
+                               }
+                       }
+                       /* ignore error */
+               }
+
+               if (write) {
+                       /*
+                        * throw out any page cache pages in this range. this
+                        * may block.
+                        */
+                       truncate_inode_pages_range(inode->i_mapping, pos,
+                                       (pos+len) | (PAGE_CACHE_SIZE - 1));
+
+                       osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+               }
+
+
+               osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
+                                                false, false);
 
-               /* BUG_ON(vino.snap != CEPH_NOSNAP); */
                ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
 
-               ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+               if (aio_req) {
+                       aio_req->total_len += len;
+                       aio_req->num_reqs++;
+                       atomic_inc(&aio_req->pending_reqs);
+
+                       req->r_callback = ceph_aio_complete_req;
+                       req->r_inode = inode;
+                       req->r_priv = aio_req;
+                       list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+
+                       pos += len;
+                       iov_iter_advance(iter, len);
+                       continue;
+               }
+
+               ret = ceph_osdc_start_request(req->r_osdc, req, false);
                if (!ret)
                        ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
+               size = i_size_read(inode);
+               if (!write) {
+                       if (ret == -ENOENT)
+                               ret = 0;
+                       if (ret >= 0 && ret < len && pos + ret < size) {
+                               int zlen = min_t(size_t, len - ret,
+                                                size - pos - ret);
+                               ceph_zero_page_vector_range(start + ret, zlen,
+                                                           pages);
+                               ret += zlen;
+                       }
+                       if (ret >= 0)
+                               len = ret;
+               }
+
                ceph_put_page_vector(pages, num_pages, false);
 
                ceph_osdc_put_request(req);
-               if (ret)
+               if (ret < 0)
                        break;
-               pos += n;
-               written += n;
-               iov_iter_advance(from, n);
 
-               if (pos > i_size_read(inode)) {
-                       check_caps = ceph_inode_set_size(inode, pos);
-                       if (check_caps)
+               pos += len;
+               iov_iter_advance(iter, len);
+
+               if (!write && pos >= size)
+                       break;
+
+               if (write && pos > size) {
+                       if (ceph_inode_set_size(inode, pos))
                                ceph_check_caps(ceph_inode(inode),
                                                CHECK_CAPS_AUTHONLY,
                                                NULL);
                }
        }
 
-       if (ret != -EOLDSNAPC && written > 0) {
+       if (aio_req) {
+               if (aio_req->num_reqs == 0) {
+                       kfree(aio_req);
+                       return ret;
+               }
+
+               ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
+                                             CEPH_CAP_FILE_RD);
+
+               while (!list_empty(&aio_req->osd_reqs)) {
+                       req = list_first_entry(&aio_req->osd_reqs,
+                                              struct ceph_osd_request,
+                                              r_unsafe_item);
+                       list_del_init(&req->r_unsafe_item);
+                       if (ret >= 0)
+                               ret = ceph_osdc_start_request(req->r_osdc,
+                                                             req, false);
+                       if (ret < 0) {
+                               BUG_ON(ret == -EOLDSNAPC);
+                               req->r_result = ret;
+                               ceph_aio_complete_req(req, NULL);
+                       }
+               }
+               return -EIOCBQUEUED;
+       }
+
+       if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
+               ret = pos - iocb->ki_pos;
                iocb->ki_pos = pos;
-               ret = written;
        }
        return ret;
 }
 
-
 /*
  * Synchronous write, straight from __user pointer or user pages.
  *
@@ -897,8 +1133,14 @@ again:
                     ceph_cap_string(got));
 
                if (ci->i_inline_version == CEPH_INLINE_NONE) {
-                       /* hmm, this isn't really async... */
-                       ret = ceph_sync_read(iocb, to, &retry_op);
+                       if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+                               ret = ceph_direct_read_write(iocb, to,
+                                                            NULL, NULL);
+                               if (ret >= 0 && ret < len)
+                                       retry_op = CHECK_EOF;
+                       } else {
+                               ret = ceph_sync_read(iocb, to, &retry_op);
+                       }
                } else {
                        retry_op = READ_INLINE;
                }
@@ -916,7 +1158,7 @@ again:
                pinned_page = NULL;
        }
        ceph_put_cap_refs(ci, got);
-       if (retry_op && ret >= 0) {
+       if (retry_op > HAVE_RETRIED && ret >= 0) {
                int statret;
                struct page *page = NULL;
                loff_t i_size;
@@ -968,12 +1210,11 @@ again:
                if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
                    ret < len) {
                        dout("sync_read hit hole, ppos %lld < size %lld"
-                            ", reading more\n", iocb->ki_pos,
-                            inode->i_size);
+                            ", reading more\n", iocb->ki_pos, i_size);
 
                        read += ret;
                        len -= ret;
-                       retry_op = 0;
+                       retry_op = HAVE_RETRIED;
                        goto again;
                }
        }
@@ -1014,7 +1255,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if (!prealloc_cf)
                return -ENOMEM;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
@@ -1052,7 +1293,7 @@ retry_snap:
        }
 
        dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
-            inode, ceph_vinop(inode), pos, count, inode->i_size);
+            inode, ceph_vinop(inode), pos, count, i_size_read(inode));
        if (fi->fmode & CEPH_FILE_MODE_LAZY)
                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
        else
@@ -1070,7 +1311,7 @@ retry_snap:
            (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
                struct ceph_snap_context *snapc;
                struct iov_iter data;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
                spin_lock(&ci->i_ceph_lock);
                if (__ceph_have_pending_cap_snap(ci)) {
@@ -1088,8 +1329,8 @@ retry_snap:
                /* we might need to revert back to that point */
                data = *from;
                if (iocb->ki_flags & IOCB_DIRECT)
-                       written = ceph_sync_direct_write(iocb, &data, pos,
-                                                        snapc);
+                       written = ceph_direct_read_write(iocb, &data, snapc,
+                                                        &prealloc_cf);
                else
                        written = ceph_sync_write(iocb, &data, pos, snapc);
                if (written == -EOLDSNAPC) {
@@ -1097,14 +1338,14 @@ retry_snap:
                                "got EOLDSNAPC, retrying\n",
                                inode, ceph_vinop(inode),
                                pos, (unsigned)count);
-                       mutex_lock(&inode->i_mutex);
+                       inode_lock(inode);
                        goto retry_snap;
                }
                if (written > 0)
                        iov_iter_advance(from, written);
                ceph_put_snap_context(snapc);
        } else {
-               loff_t old_size = inode->i_size;
+               loff_t old_size = i_size_read(inode);
                /*
                 * No need to acquire the i_truncate_mutex. Because
                 * the MDS revokes Fwb caps before sending truncate
@@ -1115,9 +1356,9 @@ retry_snap:
                written = generic_perform_write(file, from, pos);
                if (likely(written >= 0))
                        iocb->ki_pos = pos + written;
-               if (inode->i_size > old_size)
+               if (i_size_read(inode) > old_size)
                        ceph_fscache_update_objectsize(inode);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
 
        if (written >= 0) {
@@ -1147,7 +1388,7 @@ retry_snap:
        goto out_unlocked;
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 out_unlocked:
        ceph_free_cap_flush(prealloc_cf);
        current->backing_dev_info = NULL;
@@ -1160,9 +1401,10 @@ out_unlocked:
 static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 {
        struct inode *inode = file->f_mapping->host;
+       loff_t i_size;
        int ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
                ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
@@ -1172,9 +1414,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
                }
        }
 
+       i_size = i_size_read(inode);
        switch (whence) {
        case SEEK_END:
-               offset += inode->i_size;
+               offset += i_size;
                break;
        case SEEK_CUR:
                /*
@@ -1190,24 +1433,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
                offset += file->f_pos;
                break;
        case SEEK_DATA:
-               if (offset >= inode->i_size) {
+               if (offset >= i_size) {
                        ret = -ENXIO;
                        goto out;
                }
                break;
        case SEEK_HOLE:
-               if (offset >= inode->i_size) {
+               if (offset >= i_size) {
                        ret = -ENXIO;
                        goto out;
                }
-               offset = inode->i_size;
+               offset = i_size;
                break;
        }
 
        offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return offset;
 }
 
@@ -1363,7 +1606,7 @@ static long ceph_fallocate(struct file *file, int mode,
        if (!prealloc_cf)
                return -ENOMEM;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if (ceph_snap(inode) != CEPH_NOSNAP) {
                ret = -EROFS;
@@ -1418,7 +1661,7 @@ static long ceph_fallocate(struct file *file, int mode,
 
        ceph_put_cap_refs(ci, got);
 unlock:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        ceph_free_cap_flush(prealloc_cf);
        return ret;
 }
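The large block of additions above gives ceph real asynchronous O_DIRECT support: ceph_direct_read_write() fans an iocb out into one OSD request per object, and ceph_aio_complete() fires the iocb only when the last of them finishes, folding errors with a cmpxchg so the first failure wins. Distilled to its core (hypothetical minimal types; the real ceph_aio_request also carries mtime, cap-flush state, and the unsent-request list):

struct aio_ctx {
	struct kiocb *iocb;
	atomic_t pending;	/* one count per in-flight sub-request */
	int error;		/* first error observed, 0 if none */
	size_t total_len;
};

static void sub_request_done(struct aio_ctx *ctx, int rc)
{
	long ret;

	if (rc < 0)
		cmpxchg(&ctx->error, 0, rc);	/* keep only the first error */

	if (!atomic_dec_and_test(&ctx->pending))
		return;				/* sub-requests still in flight */

	ret = ctx->error ? ctx->error : ctx->total_len;
	ctx->iocb->ki_complete(ctx->iocb, ret, 0);
	kfree(ctx);
}

The -EOLDSNAPC case cannot complete in this path directly; it is bounced to a workqueue (ceph_aio_retry_work) because rebuilding the OSD request with the new snap context may block.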
index da55eb8bcffab89755baf5229b92ededf49dd484..fb4ba2e4e2a5fa5c5d62afa3b94f758c7906687b 100644 (file)
@@ -548,7 +548,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
        if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
            (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
                dout("size %lld -> %llu\n", inode->i_size, size);
-               inode->i_size = size;
+               i_size_write(inode, size);
                inode->i_blocks = (size + (1<<9) - 1) >> 9;
                ci->i_reported_size = size;
                if (truncate_seq != ci->i_truncate_seq) {
@@ -808,7 +808,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                        spin_unlock(&ci->i_ceph_lock);
 
                        err = -EINVAL;
-                       if (WARN_ON(symlen != inode->i_size))
+                       if (WARN_ON(symlen != i_size_read(inode)))
                                goto out;
 
                        err = -ENOMEM;
@@ -1549,7 +1549,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
 
        spin_lock(&ci->i_ceph_lock);
        dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
-       inode->i_size = size;
+       i_size_write(inode, size);
        inode->i_blocks = (size + (1 << 9) - 1) >> 9;
 
        /* tell the MDS if we are approaching max_size */
@@ -1911,7 +1911,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
                     inode->i_size, attr->ia_size);
                if ((issued & CEPH_CAP_FILE_EXCL) &&
                    attr->ia_size > inode->i_size) {
-                       inode->i_size = attr->ia_size;
+                       i_size_write(inode, attr->ia_size);
                        inode->i_blocks =
                                (attr->ia_size + (1 << 9) - 1) >> 9;
                        inode->i_ctime = attr->ia_ctime;
index 7febcf2475c5ab675c04dfd2fddaa3ed574522a0..50b268483302922ee63d205a2fa560ec27676eed 100644 (file)
@@ -50,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...)
        vaf.fmt = fmt;
        vaf.va = &args;
 
-       pr_err("CIFS VFS: %pV", &vaf);
+       pr_err_ratelimited("CIFS VFS: %pV", &vaf);
 
        va_end(args);
 }
index f40fbaca1b2a2c1d7457d71bd1a5e7dd4b477859..66cf0f9fff8984cb12eed1c89d91bc9c6ccc6a9d 100644 (file)
@@ -51,14 +51,13 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
 /* information message: e.g., configuration, major event */
 #define cifs_dbg(type, fmt, ...)                                       \
 do {                                                                   \
-       if (type == FYI) {                                              \
-               if (cifsFYI & CIFS_INFO) {                              \
-                       pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__);  \
-               }                                                       \
+       if (type == FYI && cifsFYI & CIFS_INFO) {                       \
+               pr_debug_ratelimited("%s: "                             \
+                           fmt, __FILE__, ##__VA_ARGS__);              \
        } else if (type == VFS) {                                       \
                cifs_vfs_err(fmt, ##__VA_ARGS__);                       \
        } else if (type == NOISY && type != 0) {                        \
-               pr_debug(fmt, ##__VA_ARGS__);                           \
+               pr_debug_ratelimited(fmt, ##__VA_ARGS__);               \
        }                                                               \
 } while (0)
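Switching cifs logging to the _ratelimited printk variants keeps a chatty or failing mount from flooding the log. Each call site gets its own static rate-limit state; pr_err_ratelimited() expands approximately to the following (a from-memory sketch of the printk.h machinery, shown under a hypothetical name):

#define my_err_ratelimited(fmt, ...)					\
({									\
	static DEFINE_RATELIMIT_STATE(_rs,				\
				      DEFAULT_RATELIMIT_INTERVAL,	\
				      DEFAULT_RATELIMIT_BURST);		\
									\
	if (__ratelimit(&_rs))						\
		pr_err(fmt, ##__VA_ARGS__);				\
})
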
 
index c4c1169814b21d15463410ca474ef8a8608c0ed0..c48ca13673e37f2eba7c4045f140e6e8b740e499 100644 (file)
@@ -507,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 
        seq_printf(s, ",rsize=%u", cifs_sb->rsize);
        seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+       seq_printf(s, ",echo_interval=%lu",
+                       tcon->ses->server->echo_interval / HZ);
        /* convert actimeo and display it in seconds */
        seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
 
@@ -640,9 +642,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
                while (*s && *s != sep)
                        s++;
 
-               mutex_lock(&dir->i_mutex);
+               inode_lock(dir);
                child = lookup_one_len(p, dentry, s - p);
-               mutex_unlock(&dir->i_mutex);
+               inode_unlock(dir);
                dput(dentry);
                dentry = child;
        } while (!IS_ERR(dentry));
@@ -752,6 +754,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
        ssize_t rc;
        struct inode *inode = file_inode(iocb->ki_filp);
 
+       if (iocb->ki_filp->f_flags & O_DIRECT)
+               return cifs_user_readv(iocb, iter);
+
        rc = cifs_revalidate_mapping(inode);
        if (rc)
                return rc;
@@ -766,6 +771,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        ssize_t written;
        int rc;
 
+       if (iocb->ki_filp->f_flags & O_DIRECT) {
+               written = cifs_user_writev(iocb, from);
+               if (written > 0 && CIFS_CACHE_READ(cinode)) {
+                       cifs_zap_mapping(inode);
+                       cifs_dbg(FYI,
+                                "Set no oplock for inode=%p after a write operation\n",
+                                inode);
+                       cinode->oplock = 0;
+               }
+               return written;
+       }
+
        written = cifs_get_writer(cinode);
        if (written)
                return written;
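With these hunks, opening a CIFS file with O_DIRECT routes reads and writes through cifs_user_readv()/cifs_user_writev() instead of the page cache, and a successful direct write zaps any cached mapping so later cached reads cannot return stale data. From user space the caller simply opens with O_DIRECT; a sketch (the mount point and file name below are hypothetical):

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd;

	/* O_DIRECT transfers generally need sector-aligned buffers and sizes */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 'x', 4096);

	fd = open("/mnt/cifs/testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0)
		return 1;

	/* served by cifs_user_writev(); cached pages are zapped afterwards */
	if (write(fd, buf, 4096) != 4096)
		return 1;

	close(fd);
	free(buf);
	return 0;
}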
index 2b510c537a0d588c532fe9200cdd4a9b03a4c9f4..a25b2513f1460608596f9c9fa7a9cb8e8383e5f2 100644 (file)
 #define SERVER_NAME_LENGTH 40
 #define SERVER_NAME_LEN_WITH_NULL     (SERVER_NAME_LENGTH + 1)
 
-/* SMB echo "timeout" -- FIXME: tunable? */
-#define SMB_ECHO_INTERVAL (60 * HZ)
+/* echo interval in seconds */
+#define SMB_ECHO_INTERVAL_MIN 1
+#define SMB_ECHO_INTERVAL_MAX 600
+#define SMB_ECHO_INTERVAL_DEFAULT 60
 
 #include "cifspdu.h"
 
@@ -225,7 +227,7 @@ struct smb_version_operations {
        void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
        void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
        /* verify the message */
-       int (*check_message)(char *, unsigned int);
+       int (*check_message)(char *, unsigned int, struct TCP_Server_Info *);
        bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
        void (*downgrade_oplock)(struct TCP_Server_Info *,
                                        struct cifsInodeInfo *, bool);
@@ -507,6 +509,7 @@ struct smb_vol {
        struct sockaddr_storage dstaddr; /* destination address */
        struct sockaddr_storage srcaddr; /* allow binding to a local IP */
        struct nls_table *local_nls;
+       unsigned int echo_interval; /* echo interval in secs */
 };
 
 #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
@@ -627,7 +630,9 @@ struct TCP_Server_Info {
 #ifdef CONFIG_CIFS_SMB2
        unsigned int    max_read;
        unsigned int    max_write;
+       __u8            preauth_hash[512];
 #endif /* CONFIG_CIFS_SMB2 */
+       unsigned long echo_interval;
 };
 
 static inline unsigned int
@@ -809,7 +814,10 @@ struct cifs_ses {
        bool need_reconnect:1; /* connection reset, uid now invalid */
 #ifdef CONFIG_CIFS_SMB2
        __u16 session_flags;
-       char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
+       __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
+       __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
+       __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+       __u8 preauth_hash[512];
 #endif /* CONFIG_CIFS_SMB2 */
 };
 
index c63fd1dde25b861b011f604522572c5619f177f1..eed7ff50faf016e6714867542953334b199e7627 100644 (file)
@@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
                        struct smb_hdr *out_buf,
                        int *bytes_returned);
 extern int cifs_reconnect(struct TCP_Server_Info *server);
-extern int checkSMB(char *buf, unsigned int length);
+extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr);
 extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
 extern bool backup_cred(struct cifs_sb_info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
@@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
 extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifs_ses *);
-extern int generate_smb3signingkey(struct cifs_ses *);
+extern int generate_smb30signingkey(struct cifs_ses *);
+extern int generate_smb311signingkey(struct cifs_ses *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
index ecb0803bdb0e50479f50b0e874a4e87cb064e2d0..4fbd92d2e113e69d5e3105235a215a52b2e756e7 100644 (file)
@@ -95,6 +95,7 @@ enum {
        Opt_cruid, Opt_gid, Opt_file_mode,
        Opt_dirmode, Opt_port,
        Opt_rsize, Opt_wsize, Opt_actimeo,
+       Opt_echo_interval,
 
        /* Mount options which take string value */
        Opt_user, Opt_pass, Opt_ip,
@@ -188,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = {
        { Opt_rsize, "rsize=%s" },
        { Opt_wsize, "wsize=%s" },
        { Opt_actimeo, "actimeo=%s" },
+       { Opt_echo_interval, "echo_interval=%s" },
 
        { Opt_blank_user, "user=" },
        { Opt_blank_user, "username=" },
@@ -368,7 +370,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
        server->session_key.response = NULL;
        server->session_key.len = 0;
        server->lstrp = jiffies;
-       mutex_unlock(&server->srv_mutex);
 
        /* mark submitted MIDs for retry and issue callback */
        INIT_LIST_HEAD(&retry_list);
@@ -381,6 +382,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
                list_move(&mid_entry->qhead, &retry_list);
        }
        spin_unlock(&GlobalMid_Lock);
+       mutex_unlock(&server->srv_mutex);
 
        cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
        list_for_each_safe(tmp, tmp2, &retry_list) {
@@ -418,6 +420,7 @@ cifs_echo_request(struct work_struct *work)
        int rc;
        struct TCP_Server_Info *server = container_of(work,
                                        struct TCP_Server_Info, echo.work);
+       unsigned long echo_interval = server->echo_interval;
 
        /*
         * We cannot send an echo if it is disabled or until the
@@ -427,7 +430,7 @@ cifs_echo_request(struct work_struct *work)
         */
        if (!server->ops->need_neg || server->ops->need_neg(server) ||
            (server->ops->can_echo && !server->ops->can_echo(server)) ||
-           time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+           time_before(jiffies, server->lstrp + echo_interval - HZ))
                goto requeue_echo;
 
        rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
@@ -436,7 +439,7 @@ cifs_echo_request(struct work_struct *work)
                         server->hostname);
 
 requeue_echo:
-       queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
+       queue_delayed_work(cifsiod_wq, &server->echo, echo_interval);
 }
 
 static bool
@@ -487,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server)
         *     a response in >60s.
         */
        if (server->tcpStatus == CifsGood &&
-           time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
-               cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n",
-                        server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ);
+           time_after(jiffies, server->lstrp + 2 * server->echo_interval)) {
+               cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n",
+                        server->hostname, (2 * server->echo_interval) / HZ);
                cifs_reconnect(server);
                wake_up(&server->response_q);
                return true;
@@ -828,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
         * 48 bytes is enough to display the header and a little bit
         * into the payload for debugging purposes.
         */
-       length = server->ops->check_message(buf, server->total_read);
+       length = server->ops->check_message(buf, server->total_read, server);
        if (length != 0)
                cifs_dump_mem("Bad SMB: ", buf,
                        min_t(unsigned int, server->total_read, 48));
@@ -1624,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
                                goto cifs_parse_mount_err;
                        }
                        break;
+               case Opt_echo_interval:
+                       if (get_option_ul(args, &option)) {
+                               cifs_dbg(VFS, "%s: Invalid echo interval value\n",
+                                        __func__);
+                               goto cifs_parse_mount_err;
+                       }
+                       vol->echo_interval = option;
+                       break;
 
                /* String Arguments */
 
@@ -2089,6 +2100,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
        if (!match_security(server, vol))
                return 0;
 
+       if (server->echo_interval != vol->echo_interval)
+               return 0;
+
        return 1;
 }
 
@@ -2208,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        tcp_ses->tcpStatus = CifsNew;
        ++tcp_ses->srv_count;
 
+       if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
+               volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX)
+               tcp_ses->echo_interval = volume_info->echo_interval * HZ;
+       else
+               tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
+
        rc = ip_connect(tcp_ses);
        if (rc < 0) {
                cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
@@ -2237,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        cifs_fscache_get_client_cookie(tcp_ses);
 
        /* queue echo request delayed work */
-       queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+       queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval);
 
        return tcp_ses;
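The new echo_interval= mount option flows from cifs_parse_mount_options() into cifs_get_tcp_session(), where it is validated against SMB_ECHO_INTERVAL_MIN/MAX and converted to jiffies; out-of-range or absent values fall back to the 60-second default, and the same per-server value now drives both echo scheduling and the server_unresponsive() timeout. A stand-alone sketch of that clamping logic (HZ hard-coded to 100 purely for the demo; the kernel config decides the real value):

#include <stdio.h>
#include <stdlib.h>

#define HZ                        100
#define SMB_ECHO_INTERVAL_MIN     1
#define SMB_ECHO_INTERVAL_MAX     600
#define SMB_ECHO_INTERVAL_DEFAULT 60

static unsigned long echo_interval_jiffies(const char *opt)
{
	unsigned long secs = opt ? strtoul(opt, NULL, 10) : 0;

	if (secs >= SMB_ECHO_INTERVAL_MIN && secs <= SMB_ECHO_INTERVAL_MAX)
		return secs * HZ;
	return SMB_ECHO_INTERVAL_DEFAULT * HZ;  /* absent or out of range */
}

int main(void)
{
	printf("%lu\n", echo_interval_jiffies("120")); /* 12000 */
	printf("%lu\n", echo_interval_jiffies("999")); /* 6000, clamped */
	return 0;
}

Note that match_server() also compares echo_interval, so mounts requesting different intervals get distinct TCP sessions rather than sharing one.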
 
index 0a2752b79e72cc2b7a083894843a8b3ae1dea23d..ff882aeaccc67c404a289fff472a26d8537e1128 100644 (file)
@@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (rc)
                return rc;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        xid = get_xid();
 
@@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
        }
 
        free_xid(xid);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return rc;
 }
 
@@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (rc)
                return rc;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        xid = get_xid();
 
@@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        }
 
        free_xid(xid);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return rc;
 }
 
@@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
         * with a brlock that prevents writing.
         */
        down_read(&cinode->lock_sem);
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        rc = generic_write_checks(iocb, from);
        if (rc <= 0)
@@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
        else
                rc = -EACCES;
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (rc > 0) {
                ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
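These mutex_lock(&inode->i_mutex) conversions, starting with cifs_get_root() above and continuing through the coda, configfs, dax and dcache hunks below, all use the inode locking wrappers introduced this cycle. They are thin inlines over i_mutex; the shapes below are as recalled from include/linux/fs.h of this release, so check the tree for the authoritative definitions:

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline int inode_trylock(struct inode *inode)
{
	return mutex_trylock(&inode->i_mutex);
}

static inline int inode_is_locked(struct inode *inode)
{
	return mutex_is_locked(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}

Because every filesystem now goes through the wrapper, a later change of the underlying primitive (i_mutex to a rwsem, say) needs no per-caller edits.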
index a329f5ba35aad8649ce3f644b9b11913c57cc5f8..aeb26dbfa1bf2dbfb2d111316064d240ef807336 100644 (file)
@@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
                        }
                } else
                        fattr.cf_uniqueid = iunique(sb, ROOT_I);
-       } else
-               fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+       } else {
+               if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+                   validinum == false && server->ops->get_srv_inum) {
+                       /*
+                        * Pass a NULL tcon to ensure we don't make a round
+                        * trip to the server. This only works for SMB2+.
+                        */
+                       tmprc = server->ops->get_srv_inum(xid,
+                               NULL, cifs_sb, full_path,
+                               &fattr.cf_uniqueid, data);
+                       if (tmprc)
+                               fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+               } else
+                       fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+       }
 
        /* query for SFU type info if supported and needed */
        if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
@@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
        } else {
                /* we already have inode, update it */
 
+               /* if uniqueid is different, return error */
+               if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
+                   CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+                       rc = -ESTALE;
+                       goto cgii_exit;
+               }
+
                /* if filetype is different, return error */
                if (unlikely(((*inode)->i_mode & S_IFMT) !=
                    (fattr.cf_mode & S_IFMT))) {
index 8442b8b8e0be145e7f68055e0025f1909442d156..813fe13c2ae175869cdc6db06a19392d09d7f05f 100644 (file)
@@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb)
 }
 
 int
-checkSMB(char *buf, unsigned int total_read)
+checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
 {
        struct smb_hdr *smb = (struct smb_hdr *)buf;
        __u32 rfclen = be32_to_cpu(smb->smb_buf_length);
index 0557c45e9c3308a368f7ff3e73f9423c81eedea4..b30a4a6d98a0f002c235bf3128012aff06ba2bc3 100644 (file)
@@ -847,6 +847,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
                 * if buggy server returns . and .. late do we want to
                 * check for that here?
                 */
+               *tmp_buf = 0;
                rc = cifs_filldir(current_entry, file, ctx,
                                  tmp_buf, max_len);
                if (rc) {
index 1c5907019045517ac302daf4c04e8caea21f1fda..389fb9f8c84e22308ac5fcb44c27f005bee6e27c 100644 (file)
@@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
         * Make sure that this really is an SMB, that it is a response,
         * and that the message ids match.
         */
-       if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
+       if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
            (mid == wire_mid)) {
                if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
                        return 0;
@@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
                                cifs_dbg(VFS, "Received Request not response\n");
                }
        } else { /* bad signature or mid */
-               if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
+               if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
                        cifs_dbg(VFS, "Bad protocol string signature header %x\n",
-                                *(unsigned int *) hdr->ProtocolId);
+                                le32_to_cpu(hdr->ProtocolId));
                if (mid != wire_mid)
                        cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
                                 mid, wire_mid);
@@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
 };
 
 int
-smb2_check_message(char *buf, unsigned int length)
+smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 {
        struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
        struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
-       __u64 mid = le64_to_cpu(hdr->MessageId);
+       __u64 mid;
        __u32 len = get_rfc1002_length(buf);
        __u32 clc_len;  /* calculated length */
        int command;
@@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length)
         * ie Validate the wct via smb2_struct_sizes table above
         */
 
+       if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+               struct smb2_transform_hdr *thdr =
+                       (struct smb2_transform_hdr *)buf;
+               struct cifs_ses *ses = NULL;
+               struct list_head *tmp;
+
+               /* decrypt frame now that it is completely read in */
+               spin_lock(&cifs_tcp_ses_lock);
+               list_for_each(tmp, &srvr->smb_ses_list) {
+                       ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+                       if (ses->Suid == thdr->SessionId)
+                               break;
+
+                       ses = NULL;
+               }
+               spin_unlock(&cifs_tcp_ses_lock);
+               if (ses == NULL) {
+                       cifs_dbg(VFS, "no decryption - session id not found\n");
+                       return 1;
+               }
+       }
+
+
+       mid = le64_to_cpu(hdr->MessageId);
        if (length < sizeof(struct smb2_pdu)) {
                if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
                        pdu->StructureSize2 = 0;
@@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 
        /* return pointer to beginning of data area, ie offset from SMB start */
        if ((*off != 0) && (*len != 0))
-               return (char *)(&hdr->ProtocolId[0]) + *off;
+               return (char *)(&hdr->ProtocolId) + *off;
        else
                return NULL;
 }
index 53ccdde6ff187c16084e10dd07372d6516d25139..3525ed756173d40009017ca11c9ae04cbd97f34a 100644 (file)
@@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
        struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
        __u64 wire_mid = le64_to_cpu(hdr->MessageId);
 
+       if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+               cifs_dbg(VFS, "encrypted frame parsing not supported yet");
+               return NULL;
+       }
+
        spin_lock(&GlobalMid_Lock);
        list_for_each_entry(mid, &server->pending_mid_q, qhead) {
                if ((mid->mid == wire_mid) &&
@@ -1692,7 +1697,7 @@ struct smb_version_operations smb30_operations = {
        .get_lease_key = smb2_get_lease_key,
        .set_lease_key = smb2_set_lease_key,
        .new_lease_key = smb2_new_lease_key,
-       .generate_signingkey = generate_smb3signingkey,
+       .generate_signingkey = generate_smb30signingkey,
        .calc_signature = smb3_calc_signature,
        .set_integrity  = smb3_set_integrity,
        .is_read_op = smb21_is_read_op,
@@ -1779,7 +1784,7 @@ struct smb_version_operations smb311_operations = {
        .get_lease_key = smb2_get_lease_key,
        .set_lease_key = smb2_set_lease_key,
        .new_lease_key = smb2_new_lease_key,
-       .generate_signingkey = generate_smb3signingkey,
+       .generate_signingkey = generate_smb311signingkey,
        .calc_signature = smb3_calc_signature,
        .set_integrity  = smb3_set_integrity,
        .is_read_op = smb21_is_read_op,
@@ -1838,7 +1843,7 @@ struct smb_version_values smb21_values = {
 struct smb_version_values smb30_values = {
        .version_string = SMB30_VERSION_STRING,
        .protocol_id = SMB30_PROT_ID,
-       .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
+       .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
        .large_lock_type = 0,
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1858,7 +1863,7 @@ struct smb_version_values smb30_values = {
 struct smb_version_values smb302_values = {
        .version_string = SMB302_VERSION_STRING,
        .protocol_id = SMB302_PROT_ID,
-       .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
+       .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
        .large_lock_type = 0,
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
index 767555518d40ccd357b6f5540f2758bb8be6e654..10f8d5cf5681e26b843c0ac15821eab69e6eeb22 100644 (file)
@@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
        hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr)
                        - 4 /*  RFC 1001 length field itself not counted */);
 
-       hdr->ProtocolId[0] = 0xFE;
-       hdr->ProtocolId[1] = 'S';
-       hdr->ProtocolId[2] = 'M';
-       hdr->ProtocolId[3] = 'B';
+       hdr->ProtocolId = SMB2_PROTO_NUMBER;
        hdr->StructureSize = cpu_to_le16(64);
        hdr->Command = smb2_cmd;
        hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */
@@ -1573,7 +1570,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
                goto ioctl_exit;
        }
 
-       memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+       memcpy(*out_data,
+              (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
               *plen);
 ioctl_exit:
        free_rsp_buf(resp_buftype, rsp);
@@ -2093,7 +2091,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
        }
 
        if (*buf) {
-               memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset,
+               memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset,
                       *nbytes);
                free_rsp_buf(resp_buftype, iov[0].iov_base);
        } else if (resp_buftype != CIFS_NO_BUFFER) {
index 4af52780ec350323cd41e993389e7214fb50f97d..ff88d9feb01e7475f75a230c004d5f40a80f14f9 100644 (file)
@@ -86,6 +86,7 @@
 #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
 
 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
 
 /*
  * SMB2 Header Definition
@@ -102,7 +103,7 @@ struct smb2_hdr {
        __be32 smb2_buf_length; /* big endian on wire */
                                /* length is only two or three bytes - with
                                 one or two byte type preceding it that MBZ */
-       __u8   ProtocolId[4];   /* 0xFE 'S' 'M' 'B' */
+       __le32 ProtocolId;      /* 0xFE 'S' 'M' 'B' */
        __le16 StructureSize;   /* 64 */
        __le16 CreditCharge;    /* MBZ */
        __le32 Status;          /* Error from server */
@@ -128,11 +129,10 @@ struct smb2_transform_hdr {
                                 one or two byte type preceding it that MBZ */
        __u8   ProtocolId[4];   /* 0xFD 'S' 'M' 'B' */
        __u8   Signature[16];
-       __u8   Nonce[11];
-       __u8   Reserved[5];
+       __u8   Nonce[16];
        __le32 OriginalMessageSize;
        __u16  Reserved1;
-       __le16 EncryptionAlgorithm;
+       __le16 Flags; /* EncryptionAlgorithm */
        __u64  SessionId;
 } __packed;
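Switching ProtocolId from __u8[4] to __le32 is safe because the little-endian encoding of 0x424d53fe is exactly the old byte sequence 0xFE 'S' 'M' 'B' (and 0x424d53fd yields 0xFD 'S' 'M' 'B' for transform headers). A quick host-side check:

#include <stdint.h>
#include <stdio.h>

/* store a 32-bit value in little-endian byte order, like cpu_to_le32() */
static void put_le32(uint8_t out[4], uint32_t v)
{
	out[0] = v & 0xff;
	out[1] = (v >> 8) & 0xff;
	out[2] = (v >> 16) & 0xff;
	out[3] = (v >> 24) & 0xff;
}

int main(void)
{
	uint8_t id[4];

	put_le32(id, 0x424d53fe);
	printf("%02x %c %c %c\n", id[0], id[1], id[2], id[3]); /* fe S M B */
	put_le32(id, 0x424d53fd);
	printf("%02x %c %c %c\n", id[0], id[1], id[2], id[3]); /* fd S M B */
	return 0;
}

The wider type also lets validation and comparison sites use a single __le32 compare instead of byte-by-byte checks, as the smb2misc.c hunks above show.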
 
index 79dc650c18b26baf2e9a4ca8ac2c67df5ebdc63e..4f07dc93608db3fc8723f53d86219087b7cc76c8 100644 (file)
@@ -34,7 +34,8 @@ struct smb_rqst;
  *****************************************************************
  */
 extern int map_smb2_to_linux_error(char *buf, bool log_err);
-extern int smb2_check_message(char *buf, unsigned int length);
+extern int smb2_check_message(char *buf, unsigned int length,
+                             struct TCP_Server_Info *server);
 extern unsigned int smb2_calc_size(void *buf);
 extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
 extern __le16 *cifs_convert_path_to_utf16(const char *from,
index d4c5b6f109a7feaa6f2c99f21ca332ff41a2673f..8732a43b10084bf787a8410498e3ec51c69ea93c 100644 (file)
@@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
        return rc;
 }
 
-int
-generate_smb3signingkey(struct cifs_ses *ses)
+static int generate_key(struct cifs_ses *ses, struct kvec label,
+                       struct kvec context, __u8 *key, unsigned int key_size)
 {
        unsigned char zero = 0x0;
        __u8 i[4] = {0, 0, 0, 1};
@@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
        unsigned char *hashptr = prfhash;
 
        memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
-       memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+       memset(key, 0x0, key_size);
 
        rc = smb3_crypto_shash_allocate(ses->server);
        if (rc) {
@@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
        }
 
        rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
-                               "SMB2AESCMAC", 12);
+                               label.iov_base, label.iov_len);
        if (rc) {
                cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
                goto smb3signkey_ret;
@@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
        }
 
        rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
-                               "SmbSign", 8);
+                               context.iov_base, context.iov_len);
        if (rc) {
                cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
                goto smb3signkey_ret;
@@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses)
                goto smb3signkey_ret;
        }
 
-       memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+       memcpy(key, hashptr, key_size);
 
 smb3signkey_ret:
        return rc;
 }
 
+struct derivation {
+       struct kvec label;
+       struct kvec context;
+};
+
+struct derivation_triplet {
+       struct derivation signing;
+       struct derivation encryption;
+       struct derivation decryption;
+};
+
+static int
+generate_smb3signingkey(struct cifs_ses *ses,
+                       const struct derivation_triplet *ptriplet)
+{
+       int rc;
+
+       rc = generate_key(ses, ptriplet->signing.label,
+                         ptriplet->signing.context, ses->smb3signingkey,
+                         SMB3_SIGN_KEY_SIZE);
+       if (rc)
+               return rc;
+
+       rc = generate_key(ses, ptriplet->encryption.label,
+                         ptriplet->encryption.context, ses->smb3encryptionkey,
+                         SMB3_SIGN_KEY_SIZE);
+       if (rc)
+               return rc;
+
+       return generate_key(ses, ptriplet->decryption.label,
+                           ptriplet->decryption.context,
+                           ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+}
+
+int
+generate_smb30signingkey(struct cifs_ses *ses)
+
+{
+       struct derivation_triplet triplet;
+       struct derivation *d;
+
+       d = &triplet.signing;
+       d->label.iov_base = "SMB2AESCMAC";
+       d->label.iov_len = 12;
+       d->context.iov_base = "SmbSign";
+       d->context.iov_len = 8;
+
+       d = &triplet.encryption;
+       d->label.iov_base = "SMB2AESCCM";
+       d->label.iov_len = 11;
+       d->context.iov_base = "ServerIn ";
+       d->context.iov_len = 10;
+
+       d = &triplet.decryption;
+       d->label.iov_base = "SMB2AESCCM";
+       d->label.iov_len = 11;
+       d->context.iov_base = "ServerOut";
+       d->context.iov_len = 10;
+
+       return generate_smb3signingkey(ses, &triplet);
+}
+
+int
+generate_smb311signingkey(struct cifs_ses *ses)
+
+{
+       struct derivation_triplet triplet;
+       struct derivation *d;
+
+       d = &triplet.signing;
+       d->label.iov_base = "SMB2AESCMAC";
+       d->label.iov_len = 12;
+       d->context.iov_base = "SmbSign";
+       d->context.iov_len = 8;
+
+       d = &triplet.encryption;
+       d->label.iov_base = "SMB2AESCCM";
+       d->label.iov_len = 11;
+       d->context.iov_base = "ServerIn ";
+       d->context.iov_len = 10;
+
+       d = &triplet.decryption;
+       d->label.iov_base = "SMB2AESCCM";
+       d->label.iov_len = 11;
+       d->context.iov_base = "ServerOut";
+       d->context.iov_len = 10;
+
+       return generate_smb3signingkey(ses, &triplet);
+}
+
 int
 smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
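generate_key() above is a single-block counter-mode KDF (SP800-108 style): one HMAC-SHA256 keyed with the session key over a counter, a dialect-specific label, a separator, a context string, and an output length, truncated to the key size. A hedged user-space equivalent using OpenSSL's HMAC(); the counter i = {0,0,0,1} is visible in the hunk, but the 0x00 separator and the 128-bit output-length field are assumptions from the surrounding kernel code and MS-SMB2, so verify the layout before relying on it:

#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <stdint.h>
#include <string.h>

static int smb3_kdf(const uint8_t *session_key, size_t session_key_len,
		    const void *label, size_t label_len,     /* incl. NUL */
		    const void *context, size_t context_len,
		    uint8_t *key, size_t key_size)
{
	uint8_t msg[128], md[32];
	unsigned int md_len = sizeof(md);
	size_t off = 0;
	static const uint8_t ctr[4] = { 0, 0, 0, 1 };   /* i = 1 */
	static const uint8_t len[4] = { 0, 0, 0, 128 }; /* L = 128 bits */

	if (4 + label_len + 1 + context_len + 4 > sizeof(msg) ||
	    key_size > sizeof(md))
		return -1;

	memcpy(msg + off, ctr, 4);               off += 4;
	memcpy(msg + off, label, label_len);     off += label_len;
	msg[off++] = 0;                          /* separator */
	memcpy(msg + off, context, context_len); off += context_len;
	memcpy(msg + off, len, 4);               off += 4;

	if (!HMAC(EVP_sha256(), session_key, (int)session_key_len,
		  msg, off, md, &md_len))
		return -1;

	memcpy(key, md, key_size);
	return 0;
}

For SMB 3.0 signing this would be invoked with label "SMB2AESCMAC" (12 bytes, NUL included) and context "SmbSign" (8 bytes), matching the derivation triplets set up in generate_smb30signingkey() above; the encryption and decryption keys reuse the "SMB2AESCCM" label with the "ServerIn "/"ServerOut" contexts.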
index 2a24c524fb9a90cedd4187460ecb90c0e5dfcf0c..87abe8ed074c31962a969b178d55757e44bdc0d5 100644 (file)
@@ -576,14 +576,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
        cifs_in_send_dec(server);
        cifs_save_when_sent(mid);
 
-       if (rc < 0)
+       if (rc < 0) {
                server->sequence_number -= 2;
+               cifs_delete_mid(mid);
+       }
+
        mutex_unlock(&server->srv_mutex);
 
        if (rc == 0)
                return 0;
 
-       cifs_delete_mid(mid);
        add_credits_and_wake_if(server, credits, optype);
        return rc;
 }
index f829fe963f5bab9e1de4b81ec9195de90e5da77d..5104d84c4f6425c5ea950a889040a5ca54527c29 100644 (file)
@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
 } while (0)
 
 
-#define CODA_FREE(ptr,size) \
-    do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+#define CODA_FREE(ptr, size) kvfree((ptr))
 
 /* inode to cnode access functions */
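CODA_FREE previously had to remember how a buffer was allocated, picking kfree() for sub-page kmalloc() allocations and vfree() otherwise; kvfree() accepts either kind of pointer, so the size-based dispatch collapses into one call. The allocation-side counterpart (CODA_ALLOC in this header) keeps the size split, roughly:

/* sketch of the matching allocation policy: small buffers from the slab,
 * large ones from vmalloc space; kvfree() releases either */
void *coda_alloc(size_t size)
{
	void *ptr;

	if (size < PAGE_SIZE)
		ptr = kmalloc(size, GFP_KERNEL);
	else
		ptr = vmalloc(size);
	return ptr;
}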
 
index fda9f4311212fe4a6d3b884d43dfdb7b5ff51e7e..42e731b8c80a680602d401b6b28239acf74291e0 100644 (file)
@@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
        if (host_file->f_op->iterate) {
                struct inode *host_inode = file_inode(host_file);
 
-               mutex_lock(&host_inode->i_mutex);
+               inode_lock(host_inode);
                ret = -ENOENT;
                if (!IS_DEADDIR(host_inode)) {
                        ret = host_file->f_op->iterate(host_file, ctx);
                        file_accessed(host_file);
                }
-               mutex_unlock(&host_inode->i_mutex);
+               inode_unlock(host_inode);
                return ret;
        }
        /* Venus: we must read Venus dirents from a file */
index 1da3805f3ddcdb209a6b357e12aacac35809dda9..f47c7483863b5ae55b1160de0f059ec23b3d4cb3 100644 (file)
@@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
 
        host_file = cfi->cfi_container;
        file_start_write(host_file);
-       mutex_lock(&coda_inode->i_mutex);
+       inode_lock(coda_inode);
        ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
        coda_inode->i_size = file_inode(host_file)->i_size;
        coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
        coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
-       mutex_unlock(&coda_inode->i_mutex);
+       inode_unlock(coda_inode);
        file_end_write(host_file);
        return ret;
 }
@@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
        err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end);
        if (err)
                return err;
-       mutex_lock(&coda_inode->i_mutex);
+       inode_lock(coda_inode);
 
        cfi = CODA_FTOC(coda_file);
        BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
        err = vfs_fsync(host_file, datasync);
        if (!err && !datasync)
                err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
-       mutex_unlock(&coda_inode->i_mutex);
+       inode_unlock(coda_inode);
 
        return err;
 }
index cab612b2ae767bbe2ffeb187a6f630f891cd52f3..f419519ec41fb4f3e932fe18561b34f3313e747b 100644 (file)
@@ -640,13 +640,13 @@ static void detach_groups(struct config_group *group)
 
                child = sd->s_dentry;
 
-               mutex_lock(&d_inode(child)->i_mutex);
+               inode_lock(d_inode(child));
 
                configfs_detach_group(sd->s_element);
                d_inode(child)->i_flags |= S_DEAD;
                dont_mount(child);
 
-               mutex_unlock(&d_inode(child)->i_mutex);
+               inode_unlock(d_inode(child));
 
                d_delete(child);
                dput(child);
@@ -834,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item,
                         * the VFS may already have hit and used them. Thus,
                         * we must lock them as rmdir() would.
                         */
-                       mutex_lock(&d_inode(dentry)->i_mutex);
+                       inode_lock(d_inode(dentry));
                        configfs_remove_dir(item);
                        d_inode(dentry)->i_flags |= S_DEAD;
                        dont_mount(dentry);
-                       mutex_unlock(&d_inode(dentry)->i_mutex);
+                       inode_unlock(d_inode(dentry));
                        d_delete(dentry);
                }
        }
@@ -874,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item,
                 * We must also lock the inode to remove it safely in case of
                 * error, as rmdir() would.
                 */
-               mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+               inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
                configfs_adjust_dir_dirent_depth_before_populate(sd);
                ret = populate_groups(to_config_group(item));
                if (ret) {
@@ -883,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item,
                        dont_mount(dentry);
                }
                configfs_adjust_dir_dirent_depth_after_populate(sd);
-               mutex_unlock(&d_inode(dentry)->i_mutex);
+               inode_unlock(d_inode(dentry));
                if (ret)
                        d_delete(dentry);
        }
@@ -1135,7 +1135,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
         * subsystem is really registered, and so we need to lock out
         * configfs_[un]register_subsystem().
         */
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
 
        subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item);
        if (!subsys_sd) {
@@ -1147,7 +1147,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
        ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
 
 out_unlock_fs:
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
 
        /*
         * If we succeeded, the fs is pinned via other methods.  If not,
@@ -1230,7 +1230,7 @@ int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys,
                 * additional locking to prevent other subsystem from being
                 * unregistered
                 */
-               mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex);
+               inode_lock(d_inode(root->cg_item.ci_dentry));
 
                /*
                 * As we are trying to depend item from other subsystem
@@ -1254,7 +1254,7 @@ out_root_unlock:
                 * We were called from subsystem other than our target so we
                 * took some locks so now it's time to release them
                 */
-               mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex);
+               inode_unlock(d_inode(root->cg_item.ci_dentry));
 
        return ret;
 }
@@ -1561,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
        down_write(&configfs_rename_sem);
        parent = item->parent->dentry;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
 
        new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
        if (!IS_ERR(new_dentry)) {
@@ -1577,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
                        error = -EEXIST;
                dput(new_dentry);
        }
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        up_write(&configfs_rename_sem);
 
        return error;
@@ -1590,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
        struct configfs_dirent * parent_sd = dentry->d_fsdata;
        int err;
 
-       mutex_lock(&d_inode(dentry)->i_mutex);
+       inode_lock(d_inode(dentry));
        /*
         * Fake invisibility if dir belongs to a group/default groups hierarchy
         * being attached
@@ -1603,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
                else
                        err = 0;
        }
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
 
        return err;
 }
@@ -1613,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
        struct dentry * dentry = file->f_path.dentry;
        struct configfs_dirent * cursor = file->private_data;
 
-       mutex_lock(&d_inode(dentry)->i_mutex);
+       inode_lock(d_inode(dentry));
        spin_lock(&configfs_dirent_lock);
        list_del_init(&cursor->s_sibling);
        spin_unlock(&configfs_dirent_lock);
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
 
        release_configfs_dirent(cursor);
 
@@ -1698,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
 {
        struct dentry * dentry = file->f_path.dentry;
 
-       mutex_lock(&d_inode(dentry)->i_mutex);
+       inode_lock(d_inode(dentry));
        switch (whence) {
                case 1:
                        offset += file->f_pos;
@@ -1706,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
                        if (offset >= 0)
                                break;
                default:
-                       mutex_unlock(&d_inode(dentry)->i_mutex);
+                       inode_unlock(d_inode(dentry));
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
@@ -1732,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
                        spin_unlock(&configfs_dirent_lock);
                }
        }
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
        return offset;
 }
 
@@ -1767,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group,
 
        parent = parent_group->cg_item.ci_dentry;
 
-       mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
        ret = create_default_group(parent_group, group);
        if (!ret) {
                spin_lock(&configfs_dirent_lock);
                configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
                spin_unlock(&configfs_dirent_lock);
        }
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        return ret;
 }
 EXPORT_SYMBOL(configfs_register_group);
@@ -1791,7 +1791,7 @@ void configfs_unregister_group(struct config_group *group)
        struct dentry *dentry = group->cg_item.ci_dentry;
        struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
 
-       mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
        spin_lock(&configfs_dirent_lock);
        configfs_detach_prep(dentry, NULL);
        spin_unlock(&configfs_dirent_lock);
@@ -1800,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group)
        d_inode(dentry)->i_flags |= S_DEAD;
        dont_mount(dentry);
        d_delete(dentry);
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
 
        dput(dentry);
 
@@ -1872,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
        sd = root->d_fsdata;
        link_group(to_config_group(sd->s_element), group);
 
-       mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
 
        err = -ENOMEM;
        dentry = d_alloc_name(root, group->cg_item.ci_name);
@@ -1892,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
                }
        }
 
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
 
        if (err) {
                unlink_group(group);
@@ -1913,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
                return;
        }
 
-       mutex_lock_nested(&d_inode(root)->i_mutex,
+       inode_lock_nested(d_inode(root),
                          I_MUTEX_PARENT);
-       mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+       inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
        mutex_lock(&configfs_symlink_mutex);
        spin_lock(&configfs_dirent_lock);
        if (configfs_detach_prep(dentry, NULL)) {
@@ -1926,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
        configfs_detach_group(&group->cg_item);
        d_inode(dentry)->i_flags |= S_DEAD;
        dont_mount(dentry);
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
 
        d_delete(dentry);
 
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
 
        dput(dentry);
 
index 3687187c8ea59e8a6a2795c8e36fa95e897955a7..33b7ee34eda5f135fef480f0bdf55bb4037ab334 100644 (file)
@@ -540,10 +540,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
        umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
        int error = 0;
 
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL);
+       inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
        error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
                                     CONFIGFS_ITEM_ATTR);
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 
        return error;
 }
@@ -562,10 +562,10 @@ int configfs_create_bin_file(struct config_item *item,
        umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG;
        int error = 0;
 
-       mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
+       inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL);
        error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
                                     CONFIGFS_ITEM_BIN_ATTR);
-       mutex_unlock(&dir->d_inode->i_mutex);
+       inode_unlock(dir->d_inode);
 
        return error;
 }
index 0cc810e9dccc15fb764d53a4cb26604b3d16672f..cee087d8f7e02f5b62198a6091a5a7f23cf0109f 100644 (file)
@@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
                /* no inode means this hasn't been made visible yet */
                return;
 
-       mutex_lock(&d_inode(dir)->i_mutex);
+       inode_lock(d_inode(dir));
        list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
                if (!sd->s_element)
                        continue;
@@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
                        break;
                }
        }
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 }
index 7af8797590640e032c74f619b9fd1931fed77e15..4fd6b0c5c6b50d29097e016bcd205ab72b758225 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pagevec.h>
 #include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
@@ -245,13 +246,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
        loff_t end = pos + iov_iter_count(iter);
 
        memset(&bh, 0, sizeof(bh));
+       bh.b_bdev = inode->i_sb->s_bdev;
 
        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
                struct address_space *mapping = inode->i_mapping;
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                retval = filemap_write_and_wait_range(mapping, pos, end - 1);
                if (retval) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        goto out;
                }
        }
@@ -263,7 +265,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
        retval = dax_io(inode, iter, pos, end, get_block, &bh);
 
        if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
        if ((retval > 0) && end_io)
                end_io(iocb, pos, retval, bh.b_private);
@@ -324,6 +326,199 @@ static int copy_user_bh(struct page *to, struct inode *inode,
        return 0;
 }
 
+#define NO_SECTOR -1
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+               sector_t sector, bool pmd_entry, bool dirty)
+{
+       struct radix_tree_root *page_tree = &mapping->page_tree;
+       pgoff_t pmd_index = DAX_PMD_INDEX(index);
+       int type, error = 0;
+       void *entry;
+
+       WARN_ON_ONCE(pmd_entry && !dirty);
+       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+       spin_lock_irq(&mapping->tree_lock);
+
+       entry = radix_tree_lookup(page_tree, pmd_index);
+       if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+               index = pmd_index;
+               goto dirty;
+       }
+
+       entry = radix_tree_lookup(page_tree, index);
+       if (entry) {
+               type = RADIX_DAX_TYPE(entry);
+               if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+                                       type != RADIX_DAX_PMD)) {
+                       error = -EIO;
+                       goto unlock;
+               }
+
+               if (!pmd_entry || type == RADIX_DAX_PMD)
+                       goto dirty;
+
+               /*
+                * We only insert dirty PMD entries into the radix tree.  This
+                * means we don't need to worry about removing a dirty PTE
+                * entry and inserting a clean PMD entry, thus reducing the
+                * range we would flush with a follow-up fsync/msync call.
+                */
+               radix_tree_delete(&mapping->page_tree, index);
+               mapping->nrexceptional--;
+       }
+
+       if (sector == NO_SECTOR) {
+               /*
+                * This can happen during correct operation if our pfn_mkwrite
+                * fault raced against a hole punch operation.  If this
+                * happens the pte that was hole punched will have been
+                * unmapped and the radix tree entry will have been removed by
+                * the time we are called, but the call will still happen.  We
+                * will return all the way up to wp_pfn_shared(), where the
+                * pte_same() check will fail, eventually causing page fault
+                * to be retried by the CPU.
+                */
+               goto unlock;
+       }
+
+       error = radix_tree_insert(page_tree, index,
+                       RADIX_DAX_ENTRY(sector, pmd_entry));
+       if (error)
+               goto unlock;
+
+       mapping->nrexceptional++;
+ dirty:
+       if (dirty)
+               radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+       spin_unlock_irq(&mapping->tree_lock);
+       return error;
+}
+
+static int dax_writeback_one(struct block_device *bdev,
+               struct address_space *mapping, pgoff_t index, void *entry)
+{
+       struct radix_tree_root *page_tree = &mapping->page_tree;
+       int type = RADIX_DAX_TYPE(entry);
+       struct radix_tree_node *node;
+       struct blk_dax_ctl dax;
+       void **slot;
+       int ret = 0;
+
+       spin_lock_irq(&mapping->tree_lock);
+       /*
+        * Regular page slots are stabilized by the page lock even
+        * without the tree itself locked.  These unlocked entries
+        * need verification under the tree lock.
+        */
+       if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+               goto unlock;
+       if (*slot != entry)
+               goto unlock;
+
+       /* another fsync thread may have already written back this entry */
+       if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+               goto unlock;
+
+       if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+               ret = -EIO;
+               goto unlock;
+       }
+
+       dax.sector = RADIX_DAX_SECTOR(entry);
+       dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+       spin_unlock_irq(&mapping->tree_lock);
+
+       /*
+        * We cannot hold tree_lock while calling dax_map_atomic() because it
+        * eventually calls cond_resched().
+        */
+       ret = dax_map_atomic(bdev, &dax);
+       if (ret < 0)
+               return ret;
+
+       if (WARN_ON_ONCE(ret < dax.size)) {
+               ret = -EIO;
+               goto unmap;
+       }
+
+       wb_cache_pmem(dax.addr, dax.size);
+
+       spin_lock_irq(&mapping->tree_lock);
+       radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+       spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+       dax_unmap_atomic(bdev, &dax);
+       return ret;
+
+ unlock:
+       spin_unlock_irq(&mapping->tree_lock);
+       return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ */
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+               loff_t end)
+{
+       struct inode *inode = mapping->host;
+       struct block_device *bdev = inode->i_sb->s_bdev;
+       pgoff_t start_index, end_index, pmd_index;
+       pgoff_t indices[PAGEVEC_SIZE];
+       struct pagevec pvec;
+       bool done = false;
+       int i, ret = 0;
+       void *entry;
+
+       if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+               return -EIO;
+
+       start_index = start >> PAGE_CACHE_SHIFT;
+       end_index = end >> PAGE_CACHE_SHIFT;
+       pmd_index = DAX_PMD_INDEX(start_index);
+
+       rcu_read_lock();
+       entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+       rcu_read_unlock();
+
+       /* see if the start of our range is covered by a PMD entry */
+       if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+               start_index = pmd_index;
+
+       tag_pages_for_writeback(mapping, start_index, end_index);
+
+       pagevec_init(&pvec, 0);
+       while (!done) {
+               pvec.nr = find_get_entries_tag(mapping, start_index,
+                               PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+                               pvec.pages, indices);
+
+               if (pvec.nr == 0)
+                       break;
+
+               for (i = 0; i < pvec.nr; i++) {
+                       if (indices[i] > end_index) {
+                               done = true;
+                               break;
+                       }
+
+                       ret = dax_writeback_one(bdev, mapping, indices[i],
+                                       pvec.pages[i]);
+                       if (ret < 0)
+                               return ret;
+               }
+       }
+       wmb_pmem();
+       return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
 {
@@ -363,6 +558,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
        }
        dax_unmap_atomic(bdev, &dax);
 
+       error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+                       vmf->flags & FAULT_FLAG_WRITE);
+       if (error)
+               goto out;
+
        error = vm_insert_mixed(vma, vaddr, dax.pfn);
 
  out:
@@ -408,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
        memset(&bh, 0, sizeof(bh));
        block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+       bh.b_bdev = inode->i_sb->s_bdev;
        bh.b_size = PAGE_SIZE;
 
  repeat:
@@ -487,6 +688,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                delete_from_page_cache(page);
                unlock_page(page);
                page_cache_release(page);
+               page = NULL;
        }
 
        /*
@@ -590,7 +792,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        struct block_device *bdev;
        pgoff_t size, pgoff;
        sector_t block;
-       int result = 0;
+       int error, result = 0;
+       bool alloc = false;
 
        /* dax pmd mappings require pfn_t_devmap() */
        if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -624,13 +827,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        }
 
        memset(&bh, 0, sizeof(bh));
+       bh.b_bdev = inode->i_sb->s_bdev;
        block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
        bh.b_size = PMD_SIZE;
-       if (get_block(inode, block, &bh, write) != 0)
+
+       if (get_block(inode, block, &bh, 0) != 0)
                return VM_FAULT_SIGBUS;
+
+       if (!buffer_mapped(&bh) && write) {
+               if (get_block(inode, block, &bh, 1) != 0)
+                       return VM_FAULT_SIGBUS;
+               alloc = true;
+       }
+
        bdev = bh.b_bdev;
-       i_mmap_lock_read(mapping);
 
        /*
         * If the filesystem isn't willing to tell us the length of a hole,
@@ -639,19 +850,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
         */
        if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
                dax_pmd_dbg(&bh, address, "allocated block too small");
-               goto fallback;
+               return VM_FAULT_FALLBACK;
        }
 
        /*
         * If we allocated new storage, make sure no process has any
         * zero pages covering this hole
         */
-       if (buffer_new(&bh)) {
-               i_mmap_unlock_read(mapping);
-               unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-               i_mmap_lock_read(mapping);
+       if (alloc) {
+               loff_t lstart = pgoff << PAGE_SHIFT;
+               loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+               truncate_pagecache_range(inode, lstart, lend);
        }
 
+       i_mmap_lock_read(mapping);
+
        /*
         * If a truncate happened while we were allocating blocks, we may
         * leave blocks allocated to the file that are beyond EOF.  We can't
@@ -664,7 +878,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                goto out;
        }
        if ((pgoff | PG_PMD_COLOUR) >= size) {
-               dax_pmd_dbg(&bh, address, "pgoff unaligned");
+               dax_pmd_dbg(&bh, address,
+                               "offset + huge page size > file size");
                goto fallback;
        }
 
@@ -732,6 +947,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                }
                dax_unmap_atomic(bdev, &dax);
 
+               /*
+                * For PTE faults we insert a radix tree entry for reads, and
+                * leave it clean.  Then on the first write we dirty the radix
+                * tree entry via the dax_pfn_mkwrite() path.  This sequence
+                * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+                * call into get_block() to translate the pgoff to a sector in
+                * order to be able to create a new radix tree entry.
+                *
+                * The PMD path doesn't have an equivalent to
+                * dax_pfn_mkwrite(), though, so for a read followed by a
+                * write we traverse all the way through __dax_pmd_fault()
+                * twice.  This means we can just skip inserting a radix tree
+                * entry completely on the initial read and just wait until
+                * the write to insert a dirty entry.
+                */
+               if (write) {
+                       error = dax_radix_entry(mapping, pgoff, dax.sector,
+                                       true, true);
+                       if (error) {
+                               dax_pmd_dbg(&bh, address,
+                                               "PMD radix insertion failed");
+                               goto fallback;
+                       }
+               }
+
                dev_dbg(part_to_dev(bdev->bd_part),
                                "%s: %s addr: %lx pfn: %lx sect: %llx\n",
                                __func__, current->comm, address,
@@ -790,15 +1030,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
- *
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+       struct file *file = vma->vm_file;
 
-       sb_start_pagefault(sb);
-       file_update_time(vma->vm_file);
-       sb_end_pagefault(sb);
+       /*
+        * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+        * RADIX_DAX_PTE entry already exists in the radix tree from a
+        * previous call to __dax_fault().  We just want to look up that PTE
+        * entry using vmf->pgoff and make sure the dirty tag is set.  This
+        * saves us from having to make a call to get_block() here to look
+        * up the sector.
+        */
+       dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -835,6 +1080,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
        BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
        memset(&bh, 0, sizeof(bh));
+       bh.b_bdev = inode->i_sb->s_bdev;
        bh.b_size = PAGE_CACHE_SIZE;
        err = get_block(inode, index, &bh, 0);
        if (err < 0)
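
The comments in the hunks above describe the new bookkeeping this series adds to fs/dax.c: the fault paths insert entries into the mapping's radix tree and tag them dirty so that a later fsync/msync can locate and flush the corresponding ranges. A rough sketch of that insert-and-tag step using the generic radix tree API follows; this is illustrative only, and the real dax_radix_entry() additionally encodes the sector and the PTE/PMD size into an exceptional entry and handles upgrades of existing entries.

  /* Illustrative sketch -- not the actual dax_radix_entry() body. */
  static int dax_entry_sketch(struct address_space *mapping, pgoff_t pgoff,
                              void *entry, bool dirty)
  {
          int error;

          error = radix_tree_preload(GFP_NOIO);   /* preallocate tree nodes */
          if (error)
                  return error;

          spin_lock_irq(&mapping->tree_lock);
          if (!radix_tree_lookup(&mapping->page_tree, pgoff))
                  error = radix_tree_insert(&mapping->page_tree, pgoff, entry);
          if (!error && dirty)
                  radix_tree_tag_set(&mapping->page_tree, pgoff,
                                     PAGECACHE_TAG_DIRTY);
          spin_unlock_irq(&mapping->tree_lock);
          radix_tree_preload_end();

          return error;
  }

This is also why dax_pfn_mkwrite() above can pass NO_SECTOR: the entry was created by an earlier __dax_fault(), so only the dirty tag needs setting.
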
index b4539e84e5779a48c470804f9051ca5f3142eb48..92d5140de8516c642b6ea5af81f0ffe89399a416 100644 (file)
@@ -2462,7 +2462,7 @@ EXPORT_SYMBOL(d_rehash);
  */
 void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
 {
-       BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
+       BUG_ON(!inode_is_locked(dentry->d_parent->d_inode));
        BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
 
        spin_lock(&dentry->d_lock);
@@ -2738,7 +2738,7 @@ static int __d_unalias(struct inode *inode,
        if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
                goto out_err;
        m1 = &dentry->d_sb->s_vfs_rename_mutex;
-       if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
+       if (!inode_trylock(alias->d_parent->d_inode))
                goto out_err;
        m2 = &alias->d_parent->d_inode->i_mutex;
 out_unalias:
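
From here on, most hunks are mechanical conversions from open-coded i_mutex locking to the new inode_lock() helpers. For reference, these wrappers are thin aliases over i_mutex at this point (a sketch of the include/linux/fs.h definitions; the point of the indirection is to let the underlying lock change later without touching every filesystem):

  static inline void inode_lock(struct inode *inode)
  {
          mutex_lock(&inode->i_mutex);
  }

  static inline void inode_unlock(struct inode *inode)
  {
          mutex_unlock(&inode->i_mutex);
  }

  static inline int inode_trylock(struct inode *inode)
  {
          return mutex_trylock(&inode->i_mutex);
  }

  static inline int inode_is_locked(struct inode *inode)
  {
          return mutex_is_locked(&inode->i_mutex);
  }

  static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
  {
          mutex_lock_nested(&inode->i_mutex, subclass);
  }
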
index b7fcc0de0b2f2771aff08a130eb20564adb3cbd4..bece948b363df5b8d331252cd9468d1bfebead68 100644 (file)
@@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
        if (!parent)
                parent = debugfs_mount->mnt_root;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        dentry = lookup_one_len(name, parent, strlen(name));
        if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
                dput(dentry);
@@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
        }
 
        if (IS_ERR(dentry)) {
-               mutex_unlock(&d_inode(parent)->i_mutex);
+               inode_unlock(d_inode(parent));
                simple_release_fs(&debugfs_mount, &debugfs_mount_count);
        }
 
@@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 
 static struct dentry *failed_creating(struct dentry *dentry)
 {
-       mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+       inode_unlock(d_inode(dentry->d_parent));
        dput(dentry);
        simple_release_fs(&debugfs_mount, &debugfs_mount_count);
        return NULL;
@@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
 
 static struct dentry *end_creating(struct dentry *dentry)
 {
-       mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+       inode_unlock(d_inode(dentry->d_parent));
        return dentry;
 }
 
@@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry)
        if (!parent || d_really_is_negative(parent))
                return;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        ret = __debugfs_remove(dentry, parent);
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        if (!ret)
                simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
@@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 
        parent = dentry;
  down:
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
  loop:
        /*
         * The parent->d_subdirs is protected by the d_lock. Outside that
@@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
                /* perhaps simple_empty(child) makes more sense */
                if (!list_empty(&child->d_subdirs)) {
                        spin_unlock(&parent->d_lock);
-                       mutex_unlock(&d_inode(parent)->i_mutex);
+                       inode_unlock(d_inode(parent));
                        parent = child;
                        goto down;
                }
@@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry)
        }
        spin_unlock(&parent->d_lock);
 
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        child = parent;
        parent = parent->d_parent;
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
 
        if (child != dentry)
                /* go up */
@@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 
        if (!__debugfs_remove(child, parent))
                simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
 }
 EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
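
The debugfs hunks preserve the long-standing VFS rule that lookup_one_len() may only be called with the parent directory's inode locked; only the spelling of the lock changes. Condensed into a hypothetical helper (error handling elided):

  /* Hypothetical helper condensing the pattern used above. */
  static struct dentry *locked_lookup(struct dentry *parent, const char *name)
  {
          struct dentry *dentry;

          inode_lock(d_inode(parent));
          dentry = lookup_one_len(name, parent, strlen(name));
          inode_unlock(d_inode(parent));

          return dentry;  /* may be an ERR_PTR() */
  }
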
 
index c35ffdc12bbafd1874dcea43ec5094867daf3e0d..1f107fd513286f0f3e8be601cd715ea0ee5443e8 100644 (file)
@@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb)
        if (!uid_valid(root_uid) || !gid_valid(root_gid))
                return -EINVAL;
 
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
 
        /* If we have already created ptmx node, return */
        if (fsi->ptmx_dentry) {
@@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb)
        fsi->ptmx_dentry = dentry;
        rc = 0;
 out:
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
        return rc;
 }
 
@@ -615,7 +615,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
 
        sprintf(s, "%d", index);
 
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
 
        dentry = d_alloc_name(root, s);
        if (dentry) {
@@ -626,7 +626,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
                inode = ERR_PTR(-ENOMEM);
        }
 
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
 
        return inode;
 }
@@ -671,7 +671,7 @@ void devpts_pty_kill(struct inode *inode)
 
        BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
 
        dentry = d_find_alias(inode);
 
@@ -680,7 +680,7 @@ void devpts_pty_kill(struct inode *inode)
        dput(dentry);   /* d_alloc_name() in devpts_pty_new() */
        dput(dentry);           /* d_find_alias above */
 
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
 }
 
 static int __init init_devpts_fs(void)
index 602e8441bc0fb6b094ec96dfb3273ff73b8b1506..1b2f7ffc8b841fd16cf312874fe8c7d4c0fa0e8e 100644 (file)
@@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                                        iocb->ki_filp->f_mapping;
 
                        /* will be released by direct_io_worker */
-                       mutex_lock(&inode->i_mutex);
+                       inode_lock(inode);
 
                        retval = filemap_write_and_wait_range(mapping, offset,
                                                              end - 1);
                        if (retval) {
-                               mutex_unlock(&inode->i_mutex);
+                               inode_unlock(inode);
                                kmem_cache_free(dio_cache, dio);
                                goto out;
                        }
@@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
        dio->i_size = i_size_read(inode);
        if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
                if (dio->flags & DIO_LOCKING)
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                kmem_cache_free(dio_cache, dio);
                retval = 0;
                goto out;
@@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
         * of protecting us from looking up uninitialized blocks.
         */
        if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
-               mutex_unlock(&dio->inode->i_mutex);
+               inode_unlock(dio->inode);
 
        /*
         * The only time we want to leave bios in flight is when a successful
index 1925d6d222b87f635b9925e05003a3c7c41d7580..58c2f4a21b7f4233b34b70efb0602b297d7f9012 100644 (file)
@@ -516,7 +516,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
                return -EINVAL;
 
        kbuf = memdup_user_nul(buf, count);
-       if (!IS_ERR(kbuf))
+       if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);
 
        if (check_version(kbuf)) {
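
The one-line dlm fix above corrects an inverted error check introduced with the switch to memdup_user_nul(): the helper returns either a valid, NUL-terminated kernel copy or an ERR_PTR()-encoded errno, never NULL. The corrected pattern in isolation (hypothetical function name):

  static ssize_t copy_and_parse(const char __user *buf, size_t count)
  {
          char *kbuf = memdup_user_nul(buf, count);

          if (IS_ERR(kbuf))               /* not !IS_ERR() */
                  return PTR_ERR(kbuf);   /* -EFAULT or -ENOMEM */

          /* ... parse the NUL-terminated buffer ... */
          kfree(kbuf);
          return count;
  }

With the old !IS_ERR() test, every successful copy returned immediately and every failed copy fell through to use an error pointer as a buffer.
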
index 040aa879d634fe15d38f0ce9bf103c7b514122b4..4e685ac1024dc313e56ed8c8245fa5add9bb7c33 100644 (file)
@@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry)
        struct dentry *dir;
 
        dir = dget_parent(dentry);
-       mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
        return dir;
 }
 
 static void unlock_dir(struct dentry *dir)
 {
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        dput(dir);
 }
 
@@ -397,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        int rc = 0;
 
        lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-       mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+       inode_lock(d_inode(lower_dir_dentry));
        lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
                                      lower_dir_dentry,
                                      ecryptfs_dentry->d_name.len);
-       mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+       inode_unlock(d_inode(lower_dir_dentry));
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -426,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                       "filename; rc = [%d]\n", __func__, rc);
                goto out;
        }
-       mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+       inode_lock(d_inode(lower_dir_dentry));
        lower_dentry = lookup_one_len(encrypted_and_encoded_name,
                                      lower_dir_dentry,
                                      encrypted_and_encoded_name_size);
-       mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+       inode_unlock(d_inode(lower_dir_dentry));
        if (IS_ERR(lower_dentry)) {
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -869,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
        if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
                struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
-               mutex_lock(&d_inode(lower_dentry)->i_mutex);
+               inode_lock(d_inode(lower_dentry));
                rc = notify_change(lower_dentry, &lower_ia, NULL);
-               mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+               inode_unlock(d_inode(lower_dentry));
        }
        return rc;
 }
@@ -970,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
        if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
                lower_ia.ia_valid &= ~ATTR_MODE;
 
-       mutex_lock(&d_inode(lower_dentry)->i_mutex);
+       inode_lock(d_inode(lower_dentry));
        rc = notify_change(lower_dentry, &lower_ia, NULL);
-       mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+       inode_unlock(d_inode(lower_dentry));
 out:
        fsstack_copy_attr_all(inode, lower_inode);
        return rc;
@@ -1048,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
                rc = -EOPNOTSUPP;
                goto out;
        }
-       mutex_lock(&d_inode(lower_dentry)->i_mutex);
+       inode_lock(d_inode(lower_dentry));
        rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
                                                   size);
-       mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+       inode_unlock(d_inode(lower_dentry));
 out:
        return rc;
 }
@@ -1075,9 +1075,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
                rc = -EOPNOTSUPP;
                goto out;
        }
-       mutex_lock(&d_inode(lower_dentry)->i_mutex);
+       inode_lock(d_inode(lower_dentry));
        rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size);
-       mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+       inode_unlock(d_inode(lower_dentry));
 out:
        return rc;
 }
@@ -1092,9 +1092,9 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
                rc = -EOPNOTSUPP;
                goto out;
        }
-       mutex_lock(&d_inode(lower_dentry)->i_mutex);
+       inode_lock(d_inode(lower_dentry));
        rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name);
-       mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+       inode_unlock(d_inode(lower_dentry));
 out:
        return rc;
 }
index caba848ac76335ef0f897cb7cb27c1bbf520c1de..c6ced4cbf0cff7d3b3f754ddbf0727d44bca8809 100644 (file)
@@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
                rc = -ENOMEM;
                goto out;
        }
-       mutex_lock(&lower_inode->i_mutex);
+       inode_lock(lower_inode);
        size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
                                           xattr_virt, PAGE_CACHE_SIZE);
        if (size < 0)
@@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
        put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
        rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
                                         xattr_virt, size, 0);
-       mutex_unlock(&lower_inode->i_mutex);
+       inode_unlock(lower_inode);
        if (rc)
                printk(KERN_ERR "Error whilst attempting to write inode size "
                       "to lower file xattr; rc = [%d]\n", rc);
index 90001da9abfdfe98b01a0eacdb5381425234b126..c424e4813ec8019b5f8a62feb8eb51c6d13f7f97 100644 (file)
@@ -50,9 +50,9 @@ static ssize_t efivarfs_file_write(struct file *file,
                d_delete(file->f_path.dentry);
                dput(file->f_path.dentry);
        } else {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                i_size_write(inode, datasize + sizeof(attributes));
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
 
        bytes = count;
index 86a2121828c312e53d64aedee9506319bbadf6c1..b8a564f29107b6d38e90b54d99c86f4aa4b96532 100644 (file)
@@ -160,10 +160,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
        efivar_entry_size(entry, &size);
        efivar_entry_add(entry, &efivarfs_list);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        inode->i_private = entry;
        i_size_write(inode, size + sizeof(entry->var.Attributes));
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        d_add(dentry, inode);
 
        return 0;
index 828ec5f07de00b7a21d016c022afc68ee0962331..dcd4ac7d3f1e77b45fde8a84585150a4862b84d9 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1307,13 +1307,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
                return;
 
        /* Be careful if suid/sgid is set */
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* reload atomically mode/uid/gid now that lock held */
        mode = inode->i_mode;
        uid = inode->i_uid;
        gid = inode->i_gid;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        /* We ignore suid/sgid if there are no mappings for them in the ns */
        if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
index 906de66e8e7e067e6179aa3408c6e6aa8ea528b7..28645f0640f735b7f168b14258cc22c110bc0f1d 100644 (file)
@@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = sync_inode_metadata(filp->f_mapping->host, 1);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
index 714cd37a6ba30fd970b8384a2c2b26fc5209351f..c46f1a190b8d9ecedd8157530ecfc3ec9ccf16c4 100644 (file)
@@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
        int err;
 
        parent = ERR_PTR(-EACCES);
-       mutex_lock(&dentry->d_inode->i_mutex);
+       inode_lock(dentry->d_inode);
        if (mnt->mnt_sb->s_export_op->get_parent)
                parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
-       mutex_unlock(&dentry->d_inode->i_mutex);
+       inode_unlock(dentry->d_inode);
 
        if (IS_ERR(parent)) {
                dprintk("%s: get_parent of %ld failed, err %d\n",
@@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
        if (err)
                goto out_err;
        dprintk("%s: found name: %s\n", __func__, nbuf);
-       mutex_lock(&parent->d_inode->i_mutex);
+       inode_lock(parent->d_inode);
        tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
-       mutex_unlock(&parent->d_inode->i_mutex);
+       inode_unlock(parent->d_inode);
        if (IS_ERR(tmp)) {
                dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
                goto out_err;
@@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
                 */
                err = exportfs_get_name(mnt, target_dir, nbuf, result);
                if (!err) {
-                       mutex_lock(&target_dir->d_inode->i_mutex);
+                       inode_lock(target_dir->d_inode);
                        nresult = lookup_one_len(nbuf, target_dir,
                                                 strlen(nbuf));
-                       mutex_unlock(&target_dir->d_inode->i_mutex);
+                       inode_unlock(target_dir->d_inode);
                        if (!IS_ERR(nresult)) {
                                if (nresult->d_inode) {
                                        dput(result);
index 11a42c5a09aee553bd85db0154be89dbd1b374cc..2c88d683cd918d673c83390aea187e763b5aa78e 100644 (file)
@@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 {
        struct inode *inode = file_inode(vma->vm_file);
        struct ext2_inode_info *ei = EXT2_I(inode);
-       int ret = VM_FAULT_NOPAGE;
        loff_t size;
+       int ret;
 
        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);
@@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
                ret = VM_FAULT_SIGBUS;
+       else
+               ret = dax_pfn_mkwrite(vma, vmf);
 
        up_read(&ei->dax_sem);
        sb_end_pagefault(inode->i_sb);
index 5d46c09863f09408258696171dfbec2b72eab994..b386af2e45f41fda4f965eafa5751ab10dc48bc3 100644 (file)
@@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
                flags = ext2_mask_flags(inode->i_mode, flags);
 
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                /* Is it quota file? Do not allow user to mess with it */
                if (IS_NOQUOTA(inode)) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        ret = -EPERM;
                        goto setflags_out;
                }
@@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                 */
                if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
                        if (!capable(CAP_LINUX_IMMUTABLE)) {
-                               mutex_unlock(&inode->i_mutex);
+                               inode_unlock(inode);
                                ret = -EPERM;
                                goto setflags_out;
                        }
@@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
                ext2_set_inode_flags(inode);
                inode->i_ctime = CURRENT_TIME_SEC;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
                mark_inode_dirty(inode);
 setflags_out:
@@ -102,10 +102,10 @@ setflags_out:
                        goto setversion_out;
                }
 
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                inode->i_ctime = CURRENT_TIME_SEC;
                inode->i_generation = generation;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
                mark_inode_dirty(inode);
 setversion_out:
index 1a0835073663ff3f1dad1f376328de5e2b1332ce..c8021208a7eb1a98ba32e16a8536146f570d5c4a 100644 (file)
@@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page)
                                EXT4_DECRYPT, page->index, page, page);
 }
 
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+                          ext4_fsblk_t pblk, ext4_lblk_t len)
 {
        struct ext4_crypto_ctx  *ctx;
        struct page             *ciphertext_page = NULL;
        struct bio              *bio;
-       ext4_lblk_t             lblk = le32_to_cpu(ex->ee_block);
-       ext4_fsblk_t            pblk = ext4_ext_pblock(ex);
-       unsigned int            len = ext4_ext_get_actual_len(ex);
        int                     ret, err = 0;
 
 #if 0
index c5882b36e5582d0005582d0fe3ac86cab70d9c9c..9a16d1e75a493f9e4663bb4ea05b9da9980ba65d 100644 (file)
@@ -213,9 +213,11 @@ retry:
                res = -ENOKEY;
                goto out;
        }
+       down_read(&keyring_key->sem);
        ukp = user_key_payload(keyring_key);
        if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
                res = -EINVAL;
+               up_read(&keyring_key->sem);
                goto out;
        }
        master_key = (struct ext4_encryption_key *)ukp->data;
@@ -226,10 +228,12 @@ retry:
                            "ext4: key size incorrect: %d\n",
                            master_key->size);
                res = -ENOKEY;
+               up_read(&keyring_key->sem);
                goto out;
        }
        res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
                                  raw_key);
+       up_read(&keyring_key->sem);
        if (res)
                goto out;
 got_key:
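
The ext4 crypto hunk above closes a key-payload lifetime race: user_key_payload() returns data that a concurrent update or revocation can free, so it must only be dereferenced while the key's semaphore is held. The rule in schematic form (a hedged sketch; the real code also validates the embedded key size):

  /* Hedged sketch of the locking rule the hunk enforces. */
  static int read_key_payload(struct key *keyring_key, void *dst, size_t len)
  {
          const struct user_key_payload *ukp;
          int res = 0;

          down_read(&keyring_key->sem);
          ukp = user_key_payload(keyring_key);
          if (ukp->datalen < len)
                  res = -EINVAL;
          else
                  memcpy(dst, ukp->data, len);
          up_read(&keyring_key->sem);

          return res;
  }
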
index cc7ca4e87144a540332213ce03b63e80e601f40e..0662b285dc8a71982a54e5895d58d797c7bcf6a4 100644 (file)
@@ -378,14 +378,22 @@ struct flex_groups {
 #define EXT4_PROJINHERIT_FL            0x20000000 /* Create with parents projid */
 #define EXT4_RESERVED_FL               0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE           0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE                0x004380FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE           0x304BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE                0x204380FF /* User modifiable flags */
+
+#define EXT4_FL_XFLAG_VISIBLE          (EXT4_SYNC_FL | \
+                                        EXT4_IMMUTABLE_FL | \
+                                        EXT4_APPEND_FL | \
+                                        EXT4_NODUMP_FL | \
+                                        EXT4_NOATIME_FL | \
+                                        EXT4_PROJINHERIT_FL)
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
                           EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
-                          EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+                          EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
+                          EXT4_PROJINHERIT_FL)
 
 /* Flags that are appropriate for regular files (all but dir-specific ones). */
 #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
@@ -555,10 +563,12 @@ enum {
 #define EXT4_GET_BLOCKS_NO_NORMALIZE           0x0040
        /* Request will not result in inode size update (user for fallocate) */
 #define EXT4_GET_BLOCKS_KEEP_SIZE              0x0080
-       /* Do not take i_data_sem locking in ext4_map_blocks */
-#define EXT4_GET_BLOCKS_NO_LOCK                        0x0100
        /* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN      0x0200
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN      0x0100
+       /* Write zeros to newly created written extents */
+#define EXT4_GET_BLOCKS_ZERO                   0x0200
+#define EXT4_GET_BLOCKS_CREATE_ZERO            (EXT4_GET_BLOCKS_CREATE |\
+                                       EXT4_GET_BLOCKS_ZERO)
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -616,6 +626,46 @@ enum {
 #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
 #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
 
+#ifndef FS_IOC_FSGETXATTR
+/* Until the uapi changes get merged for project quota... */
+
+#define FS_IOC_FSGETXATTR              _IOR('X', 31, struct fsxattr)
+#define FS_IOC_FSSETXATTR              _IOW('X', 32, struct fsxattr)
+
+/*
+ * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+       __u32           fsx_xflags;     /* xflags field value (get/set) */
+       __u32           fsx_extsize;    /* extsize field value (get/set)*/
+       __u32           fsx_nextents;   /* nextents field value (get)   */
+       __u32           fsx_projid;     /* project identifier (get/set) */
+       unsigned char   fsx_pad[12];
+};
+
+/*
+ * Flags for the fsx_xflags field
+ */
+#define FS_XFLAG_REALTIME      0x00000001      /* data in realtime volume */
+#define FS_XFLAG_PREALLOC      0x00000002      /* preallocated file extents */
+#define FS_XFLAG_IMMUTABLE     0x00000008      /* file cannot be modified */
+#define FS_XFLAG_APPEND                0x00000010      /* all writes append */
+#define FS_XFLAG_SYNC          0x00000020      /* all writes synchronous */
+#define FS_XFLAG_NOATIME       0x00000040      /* do not update access time */
+#define FS_XFLAG_NODUMP                0x00000080      /* do not include in backups */
+#define FS_XFLAG_RTINHERIT     0x00000100      /* create with rt bit set */
+#define FS_XFLAG_PROJINHERIT   0x00000200      /* create with parents projid */
+#define FS_XFLAG_NOSYMLINKS    0x00000400      /* disallow symlink creation */
+#define FS_XFLAG_EXTSIZE       0x00000800      /* extent size allocator hint */
+#define FS_XFLAG_EXTSZINHERIT  0x00001000      /* inherit inode extent size */
+#define FS_XFLAG_NODEFRAG      0x00002000      /* do not defragment */
+#define FS_XFLAG_FILESTREAM    0x00004000      /* use filestream allocator */
+#define FS_XFLAG_HASATTR       0x80000000      /* no DIFLAG for this */
+#endif /* !defined(FS_IOC_FSGETXATTR) */
+
+#define EXT4_IOC_FSGETXATTR            FS_IOC_FSGETXATTR
+#define EXT4_IOC_FSSETXATTR            FS_IOC_FSSETXATTR
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -910,6 +960,15 @@ struct ext4_inode_info {
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
+       /*
+        * i_mmap_sem is for serializing page faults with truncate / punch hole
+        * operations. We have to make sure that a new page cannot be faulted
+        * into a section of the inode that is being punched. We cannot easily
+        * use i_data_sem for this since we need protection for the whole punch
+        * operation, and i_data_sem ranks below transaction start, so we have
+        * to occasionally drop it.
+        */
+       struct rw_semaphore i_mmap_sem;
        struct inode vfs_inode;
        struct jbd2_inode *jinode;
 
@@ -993,6 +1052,7 @@ struct ext4_inode_info {
        /* Encryption params */
        struct ext4_crypt_info *i_crypt_info;
 #endif
+       kprojid_t i_projid;
 };
 
 /*
@@ -1248,7 +1308,7 @@ struct ext4_super_block {
 #endif
 
 /* Number of quota types we support */
-#define EXT4_MAXQUOTAS 2
+#define EXT4_MAXQUOTAS 3
 
 /*
  * fourth extended-fs super-block data in memory
@@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,              ENCRYPT)
                                         EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
                                         EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
                                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
-                                        EXT4_FEATURE_RO_COMPAT_QUOTA)
+                                        EXT4_FEATURE_RO_COMPAT_QUOTA |\
+                                        EXT4_FEATURE_RO_COMPAT_PROJECT)
 
 #define EXTN_FEATURE_FUNCS(ver) \
 static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
 #define        EXT4_DEF_RESUID         0
 #define        EXT4_DEF_RESGID         0
 
+/*
+ * Default project ID
+ */
+#define        EXT4_DEF_PROJID         0
+
 #define EXT4_DEF_INODE_READAHEAD_BLKS  32
 
 /*
@@ -2234,7 +2300,8 @@ void ext4_restore_control_page(struct page *data_page);
 struct page *ext4_encrypt(struct inode *inode,
                          struct page *plaintext_page);
 int ext4_decrypt(struct page *page);
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+                          ext4_fsblk_t pblk, ext4_lblk_t len);
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 int ext4_init_crypto(void);
@@ -2440,8 +2507,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_write(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
-                        struct buffer_head *bh_result, int create);
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+                           struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                                struct buffer_head *bh_result, int create);
 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
@@ -2484,9 +2551,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
 extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
+extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
+                             ext4_fsblk_t pblk, ext4_lblk_t len);
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2825,7 +2896,7 @@ do {                                                              \
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 {
        WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
-                    !mutex_is_locked(&inode->i_mutex));
+                    !inode_is_locked(inode));
        down_write(&EXT4_I(inode)->i_data_sem);
        if (newsize > EXT4_I(inode)->i_disksize)
                EXT4_I(inode)->i_disksize = newsize;
@@ -2848,6 +2919,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
        return changed;
 }
 
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+                                     loff_t len);
+
 struct ext4_group_info {
        unsigned long   bb_state;
        struct rb_root  bb_free_root;
@@ -2986,8 +3060,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
                                         struct page *page);
 extern int ext4_try_add_inline_entry(handle_t *handle,
                                     struct ext4_filename *fname,
-                                    struct dentry *dentry,
-                                    struct inode *inode);
+                                    struct inode *dir, struct inode *inode);
 extern int ext4_try_create_inline_dir(handle_t *handle,
                                      struct inode *parent,
                                      struct inode *inode);
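
With FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR wired up, the project ID and its inherit bit become reachable from userspace through the same interface XFS already exposes. A hedged userspace sketch, assuming the fsxattr definitions above are available locally (the shared uapi header had not merged yet):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>

  int main(int argc, char **argv)
  {
          struct fsxattr fsx;
          int fd;

          if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                  return 1;
          if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0) {
                  perror("FS_IOC_FSGETXATTR");
                  return 1;
          }
          printf("xflags=%#x projid=%u\n", fsx.fsx_xflags, fsx.fsx_projid);

          fsx.fsx_xflags |= FS_XFLAG_PROJINHERIT; /* children inherit projid */
          if (ioctl(fd, FS_IOC_FSSETXATTR, &fsx) < 0)
                  perror("FS_IOC_FSSETXATTR");
          return 0;
  }
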
index 551353b1b17ab930ead8eff62ee3a699b4cfe10f..0ffabaf90aa5d19914aaf9e3cbeca44922f15f86 100644 (file)
@@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
        ext4_fsblk_t ee_pblock;
        unsigned int ee_len;
-       int ret;
 
        ee_len    = ext4_ext_get_actual_len(ex);
        ee_pblock = ext4_ext_pblock(ex);
-
-       if (ext4_encrypted_inode(inode))
-               return ext4_encrypted_zeroout(inode, ex);
-
-       ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
-       if (ret > 0)
-               ret = 0;
-
-       return ret;
+       return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
+                                 ee_len);
 }
 
 /*
@@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
        }
        /* IO end_io complete, convert the filled extent to written */
        if (flags & EXT4_GET_BLOCKS_CONVERT) {
+               if (flags & EXT4_GET_BLOCKS_ZERO) {
+                       if (allocated > map->m_len)
+                               allocated = map->m_len;
+                       err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
+                                                allocated);
+                       if (err < 0)
+                               goto out2;
+               }
                ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
                                                           ppath);
                if (ret >= 0) {
@@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
        if (len <= EXT_UNWRITTEN_MAX_LEN)
                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
-       /* Wait all existing dio workers, newcomers will block on i_mutex */
-       ext4_inode_block_unlocked_dio(inode);
-       inode_dio_wait(inode);
-
        /*
         * credits to insert 1 extent into extent tree
         */
@@ -4752,8 +4748,6 @@ retry:
                goto retry;
        }
 
-       ext4_inode_resume_unlocked_dio(inode);
-
        return ret > 0 ? ret2 : ret;
 }
 
@@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
        int partial_begin, partial_end;
        loff_t start, end;
        ext4_lblk_t lblk;
-       struct address_space *mapping = inode->i_mapping;
        unsigned int blkbits = inode->i_blkbits;
 
        trace_ext4_zero_range(inode, offset, len, mode);
@@ -4785,17 +4778,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                        return ret;
        }
 
-       /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               ret = filemap_write_and_wait_range(mapping, offset,
-                                                  offset + len - 1);
-               if (ret)
-                       return ret;
-       }
-
        /*
         * Round up offset. This is not fallocate; we need to zero out
         * blocks, so convert interior block aligned part of the range to
@@ -4817,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
        else
                max_blocks -= lblk;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * Indirect files do not support unwritten extents
@@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
        if (mode & FALLOC_FL_KEEP_SIZE)
                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 
+       /* Wait all existing dio workers, newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
        /* Preallocate the range including the unaligned edges */
        if (partial_begin || partial_end) {
                ret = ext4_alloc_file_blocks(file,
@@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                                 round_down(offset, 1 << blkbits)) >> blkbits,
                                new_size, flags, mode);
                if (ret)
-                       goto out_mutex;
+                       goto out_dio;
 
        }
 
@@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,
                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
                          EXT4_EX_NOCACHE);
 
-               /* Now release the pages and zero block aligned part of pages*/
+               /*
+                * Prevent page faults from reinstantiating pages we have
+                * released from page cache.
+                */
+               down_write(&EXT4_I(inode)->i_mmap_sem);
+               ret = ext4_update_disksize_before_punch(inode, offset, len);
+               if (ret) {
+                       up_write(&EXT4_I(inode)->i_mmap_sem);
+                       goto out_dio;
+               }
+               /* Now release the pages and zero block aligned part of pages */
                truncate_pagecache_range(inode, start, end - 1);
                inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 
-               /* Wait all existing dio workers, newcomers will block on i_mutex */
-               ext4_inode_block_unlocked_dio(inode);
-               inode_dio_wait(inode);
-
                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                             flags, mode);
+               up_write(&EXT4_I(inode)->i_mmap_sem);
                if (ret)
                        goto out_dio;
        }
@@ -4909,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 out_dio:
        ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
@@ -4980,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (mode & FALLOC_FL_KEEP_SIZE)
                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * We only support preallocation for extent-based files only
@@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
                        goto out;
        }
 
+       /* Wait all existing dio workers, newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
        ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                     flags, mode);
+       ext4_inode_resume_unlocked_dio(inode);
        if (ret)
                goto out;
 
@@ -5008,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
                                                EXT4_I(inode)->i_sync_tid);
        }
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
        return ret;
 }
@@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
                        return ret;
        }
 
-       /*
-        * Need to round down offset to be aligned with page size boundary
-        * for page size > block size.
-        */
-       ioffset = round_down(offset, PAGE_SIZE);
-
-       /* Write out all dirty pages */
-       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-                                          LLONG_MAX);
-       if (ret)
-               return ret;
-
-       /* Take mutex lock */
-       mutex_lock(&inode->i_mutex);
-
+       inode_lock(inode);
        /*
         * There is no need to overlap collapse range with EOF, in which case
         * it is effectively a truncate operation
@@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
                goto out_mutex;
        }
 
-       truncate_pagecache(inode, ioffset);
-
        /* Wait for existing dio to complete */
        ext4_inode_block_unlocked_dio(inode);
        inode_dio_wait(inode);
 
+       /*
+        * Prevent page faults from reinstantiating pages we have released from
+        * page cache.
+        */
+       down_write(&EXT4_I(inode)->i_mmap_sem);
+       /*
+        * Need to round down offset to be aligned with page size boundary
+        * for page size > block size.
+        */
+       ioffset = round_down(offset, PAGE_SIZE);
+       /*
+        * Write out the tail of the last page before the removed range,
+        * since it will get removed from the page cache below.
+        */
+       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+       if (ret)
+               goto out_mmap;
+       /*
+        * Write out the data that will be shifted, to preserve it when
+        * discarding the page cache below. We are also protected from
+        * pages becoming dirty by i_mmap_sem.
+        */
+       ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+                                          LLONG_MAX);
+       if (ret)
+               goto out_mmap;
+       truncate_pagecache(inode, ioffset);
+
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
-               goto out_dio;
+               goto out_mmap;
        }
 
        down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,10 +5583,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 
 out_stop:
        ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
@@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
                        return ret;
        }
 
-       /*
-        * Need to round down to align start offset to page size boundary
-        * for page size > block size.
-        */
-       ioffset = round_down(offset, PAGE_SIZE);
-
-       /* Write out all dirty pages */
-       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-                       LLONG_MAX);
-       if (ret)
-               return ret;
-
-       /* Take mutex lock */
-       mutex_lock(&inode->i_mutex);
-
+       inode_lock(inode);
        /* Currently just for extent based files */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ret = -EOPNOTSUPP;
@@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
                goto out_mutex;
        }
 
-       truncate_pagecache(inode, ioffset);
-
        /* Wait for existing dio to complete */
        ext4_inode_block_unlocked_dio(inode);
        inode_dio_wait(inode);
 
+       /*
+        * Prevent page faults from reinstantiating pages we have released from
+        * page cache.
+        */
+       down_write(&EXT4_I(inode)->i_mmap_sem);
+       /*
+        * Need to round down to align start offset to page size boundary
+        * for page size > block size.
+        */
+       ioffset = round_down(offset, PAGE_SIZE);
+       /* Write out all dirty pages */
+       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+                       LLONG_MAX);
+       if (ret)
+               goto out_mmap;
+       truncate_pagecache(inode, ioffset);
+
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
-               goto out_dio;
+               goto out_mmap;
        }
 
        /* Expand file to avoid data loss if there is error while shifting */
@@ -5741,10 +5753,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
 out_stop:
        ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
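
Taken together, the zero-range, collapse-range and insert-range hunks above converge on a single locking order for ext4 range operations. A hedged condensation follows (hypothetical helper; the real callers interleave journal and extent-tree work where the placeholder comment sits):

  static int ext4_range_op_sketch(struct inode *inode, loff_t start, loff_t end)
  {
          int ret = 0;

          inode_lock(inode);                      /* serialize with writers */
          ext4_inode_block_unlocked_dio(inode);   /* fence off new DIO      */
          inode_dio_wait(inode);                  /* drain in-flight DIO    */

          /*
           * Page faults are blocked for the whole pagecache purge plus
           * extent update, so no page can be re-faulted into the range.
           */
          down_write(&EXT4_I(inode)->i_mmap_sem);
          truncate_pagecache_range(inode, start, end);
          /* ... journalled extent-tree work under i_data_sem ... */
          up_write(&EXT4_I(inode)->i_mmap_sem);

          ext4_inode_resume_unlocked_dio(inode);
          inode_unlock(inode);
          return ret;
  }
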
 
@@ -5779,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
-       BUG_ON(!mutex_is_locked(&inode1->i_mutex));
-       BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+       BUG_ON(!inode_is_locked(inode1));
+       BUG_ON(!inode_is_locked(inode2));
 
        *erp = ext4_es_remove_extent(inode1, lblk1, count);
        if (unlikely(*erp))
index 113837e7ba98d5cf866ee365a40a89d7fc781a71..1126436dada19519b97240bcdb42995acae46724 100644 (file)
@@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                ext4_unwritten_wait(inode);
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;
@@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        }
 
        ret = __generic_file_write_iter(iocb, from);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (ret > 0) {
                ssize_t err;
@@ -186,50 +186,42 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        return ret;
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (aio_mutex)
                mutex_unlock(aio_mutex);
        return ret;
 }
 
 #ifdef CONFIG_FS_DAX
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-       struct inode *inode = bh->b_assoc_map->host;
-       /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
-       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-       int err;
-       if (!uptodate)
-               return;
-       WARN_ON(!buffer_unwritten(bh));
-       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        int result;
        handle_t *handle = NULL;
-       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+       struct inode *inode = file_inode(vma->vm_file);
+       struct super_block *sb = inode->i_sb;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
 
        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
+               down_read(&EXT4_I(inode)->i_mmap_sem);
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                                EXT4_DATA_TRANS_BLOCKS(sb));
-       }
+       } else
+               down_read(&EXT4_I(inode)->i_mmap_sem);
 
        if (IS_ERR(handle))
                result = VM_FAULT_SIGBUS;
        else
-               result = __dax_fault(vma, vmf, ext4_get_block_dax,
-                                               ext4_end_io_unwritten);
+               result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
 
        if (write) {
                if (!IS_ERR(handle))
                        ext4_journal_stop(handle);
+               up_read(&EXT4_I(inode)->i_mmap_sem);
                sb_end_pagefault(sb);
-       }
+       } else
+               up_read(&EXT4_I(inode)->i_mmap_sem);
 
        return result;
 }
@@ -246,44 +238,88 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
+               down_read(&EXT4_I(inode)->i_mmap_sem);
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                ext4_chunk_trans_blocks(inode,
                                                        PMD_SIZE / PAGE_SIZE));
-       }
+       } else
+               down_read(&EXT4_I(inode)->i_mmap_sem);
 
        if (IS_ERR(handle))
                result = VM_FAULT_SIGBUS;
        else
                result = __dax_pmd_fault(vma, addr, pmd, flags,
-                               ext4_get_block_dax, ext4_end_io_unwritten);
+                               ext4_dax_mmap_get_block, NULL);
 
        if (write) {
                if (!IS_ERR(handle))
                        ext4_journal_stop(handle);
+               up_read(&EXT4_I(inode)->i_mmap_sem);
                sb_end_pagefault(sb);
-       }
+       } else
+               up_read(&EXT4_I(inode)->i_mmap_sem);
 
        return result;
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext4_get_block_dax,
-                               ext4_end_io_unwritten);
+       int err;
+       struct inode *inode = file_inode(vma->vm_file);
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       down_read(&EXT4_I(inode)->i_mmap_sem);
+       err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
+       up_read(&EXT4_I(inode)->i_mmap_sem);
+       sb_end_pagefault(inode->i_sb);
+
+       return err;
+}
+
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to the
+ * ext4_dax_mkwrite() handler, we check for races against truncate. Note
+ * that since we cycle through i_mmap_sem, we are sure that any hole
+ * punching that began before we were called has finished by now, and so
+ * if it included part of the file we are working on, our pte will get
+ * unmapped and the check for pte_same() in wp_pfn_shared() fails. Thus
+ * the fault gets retried and things work out as desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+                               struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       struct super_block *sb = inode->i_sb;
+       loff_t size;
+       int ret;
+
+       sb_start_pagefault(sb);
+       file_update_time(vma->vm_file);
+       down_read(&EXT4_I(inode)->i_mmap_sem);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+       else
+               ret = dax_pfn_mkwrite(vma, vmf);
+       up_read(&EXT4_I(inode)->i_mmap_sem);
+       sb_end_pagefault(sb);
+
+       return ret;
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .pmd_fault      = ext4_dax_pmd_fault,
        .page_mkwrite   = ext4_dax_mkwrite,
-       .pfn_mkwrite    = dax_pfn_mkwrite,
+       .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
 };
 #else
 #define ext4_dax_vm_ops        ext4_file_vm_ops
 #endif
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
-       .fault          = filemap_fault,
+       .fault          = ext4_filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
 };
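
For context on how these operation tables get selected: the file's mmap method installs the DAX ops only when the inode is DAX-capable. A sketch modeled on ext4_file_mmap() (simplified; the real function also refuses encrypted inodes whose keys are unavailable):

  static int ext4_file_mmap_sketch(struct file *file, struct vm_area_struct *vma)
  {
          file_accessed(file);
          if (IS_DAX(file_inode(file))) {
                  vma->vm_ops = &ext4_dax_vm_ops;
                  vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
          } else {
                  vma->vm_ops = &ext4_file_vm_ops;
          }
          return 0;
  }
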
@@ -527,11 +563,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
        int blkbits;
        int ret = 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        isize = i_size_read(inode);
        if (offset >= isize) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                return -ENXIO;
        }
 
@@ -579,7 +615,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
                dataoff = (loff_t)last << blkbits;
        } while (last <= end);
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (dataoff > isize)
                return -ENXIO;
@@ -600,11 +636,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
        int blkbits;
        int ret = 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        isize = i_size_read(inode);
        if (offset >= isize) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                return -ENXIO;
        }
 
@@ -655,7 +691,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
                break;
        } while (last <= end);
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (holeoff > isize)
                holeoff = isize;
index 1b8024d26f654c5458c7e84db5a2c439adc6500e..3fcfd50a2e8a0c05315d823fa1219bea2c87f6c8 100644 (file)
@@ -799,6 +799,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
                inode->i_gid = dir->i_gid;
        } else
                inode_init_owner(inode, dir, mode);
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+           ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
+               ei->i_projid = EXT4_I(dir)->i_projid;
+       else
+               ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+
        err = dquot_initialize(inode);
        if (err)
                goto out;
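
The value stored in ei->i_projid is the kernel-internal kprojid_t, so project IDs cross the user/kernel boundary through conversion helpers analogous to the uid/gid ones. A small hedged illustration:

  /* Hedged illustration of the projid conversion helpers used above. */
  static projid_t projid_roundtrip(projid_t id)
  {
          kprojid_t kproj = make_kprojid(&init_user_ns, id);

          if (!projid_valid(kproj))
                  return (projid_t)-1;    /* no mapping in this namespace */
          return from_kprojid(&init_user_ns, kproj);
  }
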
index d884989cc83dd99238a710f8131ab38b1139c7ca..dfe3b9bafc0d2b9cb4957c189290f8da68fb4dc4 100644 (file)
@@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
  */
 static int ext4_add_dirent_to_inline(handle_t *handle,
                                     struct ext4_filename *fname,
-                                    struct dentry *dentry,
+                                    struct inode *dir,
                                     struct inode *inode,
                                     struct ext4_iloc *iloc,
                                     void *inline_start, int inline_size)
 {
-       struct inode    *dir = d_inode(dentry->d_parent);
        int             err;
        struct ext4_dir_entry_2 *de;
 
@@ -1245,12 +1244,11 @@ out:
  * the newly created block.
  */
 int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
-                             struct dentry *dentry, struct inode *inode)
+                             struct inode *dir, struct inode *inode)
 {
        int ret, inline_size;
        void *inline_start;
        struct ext4_iloc iloc;
-       struct inode *dir = d_inode(dentry->d_parent);
 
        ret = ext4_get_inode_loc(dir, &iloc);
        if (ret)
@@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
                                                 EXT4_INLINE_DOTDOT_SIZE;
        inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
 
-       ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc,
+       ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,
                                        inline_start, inline_size);
        if (ret != -ENOSPC)
                goto out;
@@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
        if (inline_size) {
                inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
 
-               ret = ext4_add_dirent_to_inline(handle, fname, dentry,
+               ret = ext4_add_dirent_to_inline(handle, fname, dir,
                                                inode, &iloc, inline_start,
                                                inline_size);
 
index b3bd912df6bfaf475f9304eba7fb1b9ce6368e87..83bc8bfb3bea8eeefed38ca46ae8260779222405 100644 (file)
@@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func,
        return 0;
 }
 
+int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
+                      ext4_lblk_t len)
+{
+       int ret;
+
+       if (ext4_encrypted_inode(inode))
+               return ext4_encrypted_zeroout(inode, lblk, pblk, len);
+
+       ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
+       if (ret > 0)
+               ret = 0;
+
+       return ret;
+}
+
 #define check_block_validity(inode, map)       \
        __check_block_validity((inode), __func__, __LINE__, (map))
 
@@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
         * out taking i_data_sem.  So at the time the unwritten extent
         * could be converted.
         */
-       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-               down_read(&EXT4_I(inode)->i_data_sem);
+       down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -412,8 +426,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
                retval = ext4_ind_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        }
-       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-               up_read((&EXT4_I(inode)->i_data_sem));
+       up_read((&EXT4_I(inode)->i_data_sem));
 
        /*
         * We don't check m_len because the extent will be collapsed in status
@@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
-       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-               down_read(&EXT4_I(inode)->i_data_sem);
+       down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                if (ret < 0)
                        retval = ret;
        }
-       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-               up_read((&EXT4_I(inode)->i_data_sem));
+       up_read((&EXT4_I(inode)->i_data_sem));
 
 found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -625,6 +636,22 @@ found:
                        WARN_ON(1);
                }
 
+               /*
+                * We have to zero out the blocks before inserting them into
+                * the extent status tree. Otherwise someone could look them up
+                * there and use them before they are really zeroed.
+                */
+               if (flags & EXT4_GET_BLOCKS_ZERO &&
+                   map->m_flags & EXT4_MAP_MAPPED &&
+                   map->m_flags & EXT4_MAP_NEW) {
+                       ret = ext4_issue_zeroout(inode, map->m_lblk,
+                                                map->m_pblk, map->m_len);
+                       if (ret) {
+                               retval = ret;
+                               goto out_sem;
+                       }
+               }
+
                /*
                 * If the extent has been zeroed out, we don't need to update
                 * extent status tree.
@@ -632,7 +659,7 @@ found:
                if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
                    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
                        if (ext4_es_is_written(&es))
-                               goto has_zeroout;
+                               goto out_sem;
                }
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -643,11 +670,13 @@ found:
                        status |= EXTENT_STATUS_DELAYED;
                ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                            map->m_pblk, status);
-               if (ret < 0)
+               if (ret < 0) {
                        retval = ret;
+                       goto out_sem;
+               }
        }
 
-has_zeroout:
+out_sem:
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
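
The block above is the heart of the new EXT4_GET_BLOCKS_ZERO handling: zeroing
has to complete before the extent is published in the extent status tree. A
schematic interleaving of the race this ordering closes (illustrative only,
not the literal call chain):

        /*
         * Without zero-before-publish (schematic):
         *
         *   CPU0 (allocating writer)        CPU1 (concurrent lookup)
         *   ------------------------        ------------------------
         *   allocate new blocks
         *   ext4_es_insert_extent()
         *                                   ext4_es_lookup_extent() -> hit
         *                                   maps and reads stale on-disk data
         *   ext4_issue_zeroout()
         *
         * With the ordering above, ext4_issue_zeroout() finishes before the
         * extent becomes visible, so a racing lookup only ever sees zeroed
         * blocks.
         */
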
@@ -674,7 +703,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
        map.m_lblk = iblock;
        map.m_len = bh->b_size >> inode->i_blkbits;
 
-       if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
+       if (flags && !handle) {
                /* Direct IO write... */
                if (map.m_len > DIO_MAX_BLOCKS)
                        map.m_len = DIO_MAX_BLOCKS;
@@ -694,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
                map_bh(bh, inode->i_sb, map.m_pblk);
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-               if (IS_DAX(inode) && buffer_unwritten(bh)) {
-                       /*
-                        * dgc: I suspect unwritten conversion on ext4+DAX is
-                        * fundamentally broken here when there are concurrent
-                        * read/write in progress on this inode.
-                        */
-                       WARN_ON_ONCE(io_end);
-                       bh->b_assoc_map = inode->i_mapping;
-                       bh->b_private = (void *)(unsigned long)iblock;
-               }
                if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
                        set_buffer_defer_completion(bh);
                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -879,9 +898,6 @@ int do_journal_get_write_access(handle_t *handle,
        return ret;
 }
 
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create);
-
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
                                  get_block_t *get_block)
@@ -3054,25 +3070,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,
                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create)
 {
-       ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+       int ret;
+
+       ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
                   inode->i_ino, create);
-       return _ext4_get_block(inode, iblock, bh_result,
-                              EXT4_GET_BLOCKS_NO_LOCK);
+       ret = _ext4_get_block(inode, iblock, bh_result, 0);
+       /*
+        * Blocks should have been preallocated! ext4_file_write_iter() checks
+        * that.
+        */
+       WARN_ON_ONCE(!buffer_mapped(bh_result));
+
+       return ret;
 }
 
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create)
+#ifdef CONFIG_FS_DAX
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+                           struct buffer_head *bh_result, int create)
 {
-       int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
-       if (create)
-               flags |= EXT4_GET_BLOCKS_CREATE;
-       ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+       int ret, err;
+       int credits;
+       struct ext4_map_blocks map;
+       handle_t *handle = NULL;
+       int flags = 0;
+
+       ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
                   inode->i_ino, create);
-       return _ext4_get_block(inode, iblock, bh_result, flags);
+       map.m_lblk = iblock;
+       map.m_len = bh_result->b_size >> inode->i_blkbits;
+       credits = ext4_chunk_trans_blocks(inode, map.m_len);
+       if (create) {
+               flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
+               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       return ret;
+               }
+       }
+
+       ret = ext4_map_blocks(handle, inode, &map, flags);
+       if (create) {
+               err = ext4_journal_stop(handle);
+               if (ret >= 0 && err < 0)
+                       ret = err;
+       }
+       if (ret <= 0)
+               goto out;
+       if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+               int err2;
+
+               /*
+                * We are protected by i_mmap_sem, so we know the block cannot
+                * go away from under us even though we dropped i_data_sem.
+                * Convert extent to written and write zeros there.
+                *
+                * Note: We may get here even when create == 0.
+                */
+               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+
+               err = ext4_map_blocks(handle, inode, &map,
+                     EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
+               if (err < 0)
+                       ret = err;
+               err2 = ext4_journal_stop(handle);
+               if (err2 < 0 && ret > 0)
+                       ret = err2;
+       }
+out:
+       WARN_ON_ONCE(ret == 0 && create);
+       if (ret > 0) {
+               map_bh(bh_result, inode->i_sb, map.m_pblk);
+               bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
+                                       map.m_flags;
+               /*
+                * At least for now we have to clear BH_New so that DAX code
+                * doesn't attempt to zero blocks again in a racy way.
+                */
+               bh_result->b_state &= ~(1 << BH_New);
+               bh_result->b_size = map.m_len << inode->i_blkbits;
+               ret = 0;
+       }
+       return ret;
 }
+#endif
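
Compressed to its essentials, the new DAX fault helper allocates in two
journalled phases. The sketch below elides the error handling and journal
bookkeeping but keeps the real flag names from ext4_dax_mmap_get_block()
above:

        /* Sketch of ext4_dax_mmap_get_block(), happy path only */
        if (create)     /* write fault: allocate as unwritten, zeroed on disk */
                ext4_map_blocks(handle, inode, &map,
                                EXT4_GET_BLOCKS_PRE_IO |
                                EXT4_GET_BLOCKS_CREATE_ZERO);
        if (map.m_flags & EXT4_MAP_UNWRITTEN)
                /* convert to written; i_mmap_sem, held by the fault path,
                 * keeps the blocks from being freed while i_data_sem is
                 * dropped between the two calls */
                ext4_map_blocks(handle, inode, &map,
                                EXT4_GET_BLOCKS_CONVERT |
                                EXT4_GET_BLOCKS_CREATE_ZERO);
        bh_result->b_state &= ~(1 << BH_New);   /* keep DAX from re-zeroing */
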
 
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private)
@@ -3143,10 +3230,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        /* If we do an overwrite dio, i_mutex locking can be released */
        overwrite = *((int *)iocb->private);
 
-       if (overwrite) {
-               down_read(&EXT4_I(inode)->i_data_sem);
-               mutex_unlock(&inode->i_mutex);
-       }
+       if (overwrite)
+               inode_unlock(inode);
 
        /*
         * We could direct write to holes and fallocate.
@@ -3189,7 +3274,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        }
 
        if (overwrite) {
-               get_block_func = ext4_get_block_write_nolock;
+               get_block_func = ext4_get_block_overwrite;
        } else {
                get_block_func = ext4_get_block_write;
                dio_flags = DIO_LOCKING;
@@ -3245,10 +3330,8 @@ retake_lock:
        if (iov_iter_rw(iter) == WRITE)
                inode_dio_end(inode);
        /* take i_mutex again if we do an overwrite dio */
-       if (overwrite) {
-               up_read(&EXT4_I(inode)->i_data_sem);
-               mutex_lock(&inode->i_mutex);
-       }
+       if (overwrite)
+               inode_lock(inode);
 
        return ret;
 }
@@ -3558,6 +3641,35 @@ int ext4_can_truncate(struct inode *inode)
        return 0;
 }
 
+/*
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise the i_disksize
+ * update can get lost, as it may have been postponed to writeback submission,
+ * and that submission will never happen once we truncate the page cache.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+                                     loff_t len)
+{
+       handle_t *handle;
+       loff_t size = i_size_read(inode);
+
+       WARN_ON(!inode_is_locked(inode));
+       if (offset > size || offset + len < size)
+               return 0;
+
+       if (EXT4_I(inode)->i_disksize >= size)
+               return 0;
+
+       handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+       ext4_update_i_disksize(inode, size);
+       ext4_mark_inode_dirty(handle, inode);
+       ext4_journal_stop(handle);
+
+       return 0;
+}
+
 /*
  * ext4_punch_hole: punches a hole in a file by releasing the blocks
  * associated with the given offset and length
@@ -3595,7 +3707,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
                        return ret;
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* No need to punch hole beyond i_size */
        if (offset >= inode->i_size)
@@ -3623,17 +3735,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
        }
 
+       /* Wait for all existing dio workers; newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
+       /*
+        * Prevent page faults from reinstantiating pages we have released from
+        * page cache.
+        */
+       down_write(&EXT4_I(inode)->i_mmap_sem);
        first_block_offset = round_up(offset, sb->s_blocksize);
        last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
        /* Now release the pages and zero the block-aligned parts of pages */
-       if (last_block_offset > first_block_offset)
+       if (last_block_offset > first_block_offset) {
+               ret = ext4_update_disksize_before_punch(inode, offset, length);
+               if (ret)
+                       goto out_dio;
                truncate_pagecache_range(inode, first_block_offset,
                                         last_block_offset);
-
-       /* Wait all existing dio workers, newcomers will block on i_mutex */
-       ext4_inode_block_unlocked_dio(inode);
-       inode_dio_wait(inode);
+       }
 
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
@@ -3680,19 +3801,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
 
-       /* Now release the pages again to reduce race window */
-       if (last_block_offset > first_block_offset)
-               truncate_pagecache_range(inode, first_block_offset,
-                                        last_block_offset);
-
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
 out_stop:
        ext4_journal_stop(handle);
 out_dio:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
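
The reshuffle above (drain direct IO and take i_mmap_sem first, then a single
truncate_pagecache_range() preceded by ext4_update_disksize_before_punch())
exists because a dirty page beyond i_disksize is what carries the pending
size update. A schematic failure timeline without it (illustrative):

        /*
         *   write() extends i_size     -> page dirtied, i_disksize update
         *                                 deferred to writeback submission
         *   punch hole over that range -> truncate_pagecache_range() drops
         *                                 the dirty page, so that writeback
         *                                 never happens
         *   <crash>                    -> on-disk i_disksize never caught up;
         *                                 data past it is silently lost
         *
         * ext4_update_disksize_before_punch() journals the i_disksize update
         * before the pages are dropped, closing the window.
         */
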
 
@@ -3762,7 +3879,7 @@ void ext4_truncate(struct inode *inode)
         * have i_mutex locked because it's not necessary.
         */
        if (!(inode->i_state & (I_NEW|I_FREEING)))
-               WARN_ON(!mutex_is_locked(&inode->i_mutex));
+               WARN_ON(!inode_is_locked(inode));
        trace_ext4_truncate_enter(inode);
 
        if (!ext4_can_truncate(inode))
@@ -4076,6 +4193,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
                EXT4_I(inode)->i_inline_off = 0;
 }
 
+int ext4_get_projid(struct inode *inode, kprojid_t *projid)
+{
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+               return -EOPNOTSUPP;
+       *projid = EXT4_I(inode)->i_projid;
+       return 0;
+}
+
 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 {
        struct ext4_iloc iloc;
@@ -4087,6 +4212,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        int block;
        uid_t i_uid;
        gid_t i_gid;
+       projid_t i_projid;
 
        inode = iget_locked(sb, ino);
        if (!inode)
@@ -4136,12 +4262,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
        i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+           EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+               i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
+       else
+               i_projid = EXT4_DEF_PROJID;
+
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
                i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        i_uid_write(inode, i_uid);
        i_gid_write(inode, i_gid);
+       ei->i_projid = make_kprojid(&init_user_ns, i_projid);
        set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
 
        ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
@@ -4440,6 +4574,7 @@ static int ext4_do_update_inode(handle_t *handle,
        int need_datasync = 0, set_large_file = 0;
        uid_t i_uid;
        gid_t i_gid;
+       projid_t i_projid;
 
        spin_lock(&ei->i_raw_lock);
 
@@ -4452,6 +4587,7 @@ static int ext4_do_update_inode(handle_t *handle,
        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
        i_uid = i_uid_read(inode);
        i_gid = i_gid_read(inode);
+       i_projid = from_kprojid(&init_user_ns, ei->i_projid);
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
@@ -4529,6 +4665,15 @@ static int ext4_do_update_inode(handle_t *handle,
                                cpu_to_le16(ei->i_extra_isize);
                }
        }
+
+       BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                       EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+              i_projid != EXT4_DEF_PROJID);
+
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+               raw_inode->i_projid = cpu_to_le32(i_projid);
+
        ext4_inode_csum_set(inode, raw_inode, ei);
        spin_unlock(&ei->i_raw_lock);
        if (inode->i_sb->s_flags & MS_LAZYTIME)
@@ -4824,6 +4969,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        } else
                                ext4_wait_for_tail_page_commit(inode);
                }
+               down_write(&EXT4_I(inode)->i_mmap_sem);
                /*
                 * Truncate pagecache after we've waited for commit
                 * in data=journal mode to make pages freeable.
@@ -4831,6 +4977,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                truncate_pagecache(inode, inode->i_size);
                if (shrink)
                        ext4_truncate(inode);
+               up_write(&EXT4_I(inode)->i_mmap_sem);
        }
 
        if (!rc) {
@@ -5279,6 +5426,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);
+
+       down_read(&EXT4_I(inode)->i_mmap_sem);
        /* Delalloc case is easy... */
        if (test_opt(inode->i_sb, DELALLOC) &&
            !ext4_should_journal_data(inode) &&
@@ -5348,6 +5497,19 @@ retry_alloc:
 out_ret:
        ret = block_page_mkwrite_return(ret);
 out:
+       up_read(&EXT4_I(inode)->i_mmap_sem);
        sb_end_pagefault(inode->i_sb);
        return ret;
 }
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       int err;
+
+       down_read(&EXT4_I(inode)->i_mmap_sem);
+       err = filemap_fault(vma, vmf);
+       up_read(&EXT4_I(inode)->i_mmap_sem);
+
+       return err;
+}
index 5e872fd40e5e38655435fbcf91802244c9ce8959..0f6c36922c2466116d610e8092bd17b64285c29c 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/mount.h>
 #include <linux/file.h>
 #include <linux/random.h>
+#include <linux/quotaops.h>
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
@@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16])
        return 1;
 }
 
+static int ext4_ioctl_setflags(struct inode *inode,
+                              unsigned int flags)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       handle_t *handle = NULL;
+       int err = -EPERM, migrate = 0;
+       struct ext4_iloc iloc;
+       unsigned int oldflags, mask, i;
+       unsigned int jflag;
+
+       /* Is it a quota file? Do not allow the user to mess with it */
+       if (IS_NOQUOTA(inode))
+               goto flags_out;
+
+       oldflags = ei->i_flags;
+
+       /* The JOURNAL_DATA flag is modifiable only by root */
+       jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+       /*
+        * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+        * the relevant capability.
+        *
+        * This test looks nicer. Thanks to Pauline Middelink
+        */
+       if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+               if (!capable(CAP_LINUX_IMMUTABLE))
+                       goto flags_out;
+       }
+
+       /*
+        * The JOURNAL_DATA flag can only be changed by
+        * the relevant capability.
+        */
+       if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+               if (!capable(CAP_SYS_RESOURCE))
+                       goto flags_out;
+       }
+       if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+               migrate = 1;
+
+       if (flags & EXT4_EOFBLOCKS_FL) {
+               /* we don't support adding EOFBLOCKS flag */
+               if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+                       err = -EOPNOTSUPP;
+                       goto flags_out;
+               }
+       } else if (oldflags & EXT4_EOFBLOCKS_FL)
+               ext4_truncate(inode);
+
+       handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+       if (IS_ERR(handle)) {
+               err = PTR_ERR(handle);
+               goto flags_out;
+       }
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+       err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto flags_err;
+
+       for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+               if (!(mask & EXT4_FL_USER_MODIFIABLE))
+                       continue;
+               if (mask & flags)
+                       ext4_set_inode_flag(inode, i);
+               else
+                       ext4_clear_inode_flag(inode, i);
+       }
+
+       ext4_set_inode_flags(inode);
+       inode->i_ctime = ext4_current_time(inode);
+
+       err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+       ext4_journal_stop(handle);
+       if (err)
+               goto flags_out;
+
+       if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+               err = ext4_change_inode_journal_flag(inode, jflag);
+       if (err)
+               goto flags_out;
+       if (migrate) {
+               if (flags & EXT4_EXTENTS_FL)
+                       err = ext4_ext_migrate(inode);
+               else
+                       err = ext4_ind_migrate(inode);
+       }
+
+flags_out:
+       return err;
+}
+
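
Hoisting the body out of the EXT4_IOC_SETFLAGS case lets EXT4_IOC_FSSETXATTR
reuse it. The helper assumes the caller holds the inode lock; both call sites
later in this patch reduce to the same pattern:

        inode_lock(inode);
        err = ext4_ioctl_setflags(inode, flags);
        inode_unlock(inode);
        mnt_drop_write_file(filp);
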
+#ifdef CONFIG_QUOTA
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+       struct inode *inode = file_inode(filp);
+       struct super_block *sb = inode->i_sb;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int err, rc;
+       handle_t *handle;
+       kprojid_t kprojid;
+       struct ext4_iloc iloc;
+       struct ext4_inode *raw_inode;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+               if (projid != EXT4_DEF_PROJID)
+                       return -EOPNOTSUPP;
+               else
+                       return 0;
+       }
+
+       if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE)
+               return -EOPNOTSUPP;
+
+       kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
+
+       if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
+               return 0;
+
+       err = mnt_want_write_file(filp);
+       if (err)
+               return err;
+
+       err = -EPERM;
+       inode_lock(inode);
+       /* Is it a quota file? Do not allow the user to mess with it */
+       if (IS_NOQUOTA(inode))
+               goto out_unlock;
+
+       err = ext4_get_inode_loc(inode, &iloc);
+       if (err)
+               goto out_unlock;
+
+       raw_inode = ext4_raw_inode(&iloc);
+       if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
+               err = -EOVERFLOW;
+               brelse(iloc.bh);
+               goto out_unlock;
+       }
+       brelse(iloc.bh);
+
+       dquot_initialize(inode);
+
+       handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+               EXT4_QUOTA_INIT_BLOCKS(sb) +
+               EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
+       if (IS_ERR(handle)) {
+               err = PTR_ERR(handle);
+               goto out_unlock;
+       }
+
+       err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto out_stop;
+
+       if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
+               struct dquot *transfer_to[MAXQUOTAS] = { };
+
+               transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+               if (transfer_to[PRJQUOTA]) {
+                       err = __dquot_transfer(inode, transfer_to);
+                       dqput(transfer_to[PRJQUOTA]);
+                       if (err)
+                               goto out_dirty;
+               }
+       }
+       EXT4_I(inode)->i_projid = kprojid;
+       inode->i_ctime = ext4_current_time(inode);
+out_dirty:
+       rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+       if (!err)
+               err = rc;
+out_stop:
+       ext4_journal_stop(handle);
+out_unlock:
+       inode_unlock(inode);
+       mnt_drop_write_file(filp);
+       return err;
+}
+#else
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+       if (projid != EXT4_DEF_PROJID)
+               return -EOPNOTSUPP;
+       return 0;
+}
+#endif
+
+/* Transfer internal flags to xflags */
+static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
+{
+       __u32 xflags = 0;
+
+       if (iflags & EXT4_SYNC_FL)
+               xflags |= FS_XFLAG_SYNC;
+       if (iflags & EXT4_IMMUTABLE_FL)
+               xflags |= FS_XFLAG_IMMUTABLE;
+       if (iflags & EXT4_APPEND_FL)
+               xflags |= FS_XFLAG_APPEND;
+       if (iflags & EXT4_NODUMP_FL)
+               xflags |= FS_XFLAG_NODUMP;
+       if (iflags & EXT4_NOATIME_FL)
+               xflags |= FS_XFLAG_NOATIME;
+       if (iflags & EXT4_PROJINHERIT_FL)
+               xflags |= FS_XFLAG_PROJINHERIT;
+       return xflags;
+}
+
+/* Transfer xflags flags to internal */
+static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
+{
+       unsigned long iflags = 0;
+
+       if (xflags & FS_XFLAG_SYNC)
+               iflags |= EXT4_SYNC_FL;
+       if (xflags & FS_XFLAG_IMMUTABLE)
+               iflags |= EXT4_IMMUTABLE_FL;
+       if (xflags & FS_XFLAG_APPEND)
+               iflags |= EXT4_APPEND_FL;
+       if (xflags & FS_XFLAG_NODUMP)
+               iflags |= EXT4_NODUMP_FL;
+       if (xflags & FS_XFLAG_NOATIME)
+               iflags |= EXT4_NOATIME_FL;
+       if (xflags & FS_XFLAG_PROJINHERIT)
+               iflags |= EXT4_PROJINHERIT_FL;
+
+       return iflags;
+}
+
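
The two translators are inverses on the bits they cover, which is what lets
EXT4_IOC_FSSETXATTR round-trip flags through the generic fsxattr encoding. A
hypothetical self-check (not part of the patch) makes the property explicit:

        /* Hypothetical assertion: round-tripping preserves covered flags */
        unsigned long iflags = EXT4_SYNC_FL | EXT4_NOATIME_FL |
                               EXT4_PROJINHERIT_FL;
        BUG_ON(ext4_xflags_to_iflags(ext4_iflags_to_xflags(iflags)) != iflags);
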
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        struct inode *inode = file_inode(filp);
@@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
                return put_user(flags, (int __user *) arg);
        case EXT4_IOC_SETFLAGS: {
-               handle_t *handle = NULL;
-               int err, migrate = 0;
-               struct ext4_iloc iloc;
-               unsigned int oldflags, mask, i;
-               unsigned int jflag;
+               int err;
 
                if (!inode_owner_or_capable(inode))
                        return -EACCES;
@@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
                flags = ext4_mask_flags(inode->i_mode, flags);
 
-               err = -EPERM;
-               mutex_lock(&inode->i_mutex);
-               /* Is it quota file? Do not allow user to mess with it */
-               if (IS_NOQUOTA(inode))
-                       goto flags_out;
-
-               oldflags = ei->i_flags;
-
-               /* The JOURNAL_DATA flag is modifiable only by root */
-               jflag = flags & EXT4_JOURNAL_DATA_FL;
-
-               /*
-                * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-                * the relevant capability.
-                *
-                * This test looks nicer. Thanks to Pauline Middelink
-                */
-               if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-                       if (!capable(CAP_LINUX_IMMUTABLE))
-                               goto flags_out;
-               }
-
-               /*
-                * The JOURNAL_DATA flag can only be changed by
-                * the relevant capability.
-                */
-               if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-                       if (!capable(CAP_SYS_RESOURCE))
-                               goto flags_out;
-               }
-               if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
-                       migrate = 1;
-
-               if (flags & EXT4_EOFBLOCKS_FL) {
-                       /* we don't support adding EOFBLOCKS flag */
-                       if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
-                               err = -EOPNOTSUPP;
-                               goto flags_out;
-                       }
-               } else if (oldflags & EXT4_EOFBLOCKS_FL)
-                       ext4_truncate(inode);
-
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
-               if (IS_ERR(handle)) {
-                       err = PTR_ERR(handle);
-                       goto flags_out;
-               }
-               if (IS_SYNC(inode))
-                       ext4_handle_sync(handle);
-               err = ext4_reserve_inode_write(handle, inode, &iloc);
-               if (err)
-                       goto flags_err;
-
-               for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
-                       if (!(mask & EXT4_FL_USER_MODIFIABLE))
-                               continue;
-                       if (mask & flags)
-                               ext4_set_inode_flag(inode, i);
-                       else
-                               ext4_clear_inode_flag(inode, i);
-               }
-
-               ext4_set_inode_flags(inode);
-               inode->i_ctime = ext4_current_time(inode);
-
-               err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-flags_err:
-               ext4_journal_stop(handle);
-               if (err)
-                       goto flags_out;
-
-               if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
-                       err = ext4_change_inode_journal_flag(inode, jflag);
-               if (err)
-                       goto flags_out;
-               if (migrate) {
-                       if (flags & EXT4_EXTENTS_FL)
-                               err = ext4_ext_migrate(inode);
-                       else
-                               err = ext4_ind_migrate(inode);
-               }
-
-flags_out:
-               mutex_unlock(&inode->i_mutex);
+               inode_lock(inode);
+               err = ext4_ioctl_setflags(inode, flags);
+               inode_unlock(inode);
                mnt_drop_write_file(filp);
                return err;
        }
@@ -349,7 +497,7 @@ flags_out:
                        goto setversion_out;
                }
 
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
@@ -364,7 +512,7 @@ flags_out:
                ext4_journal_stop(handle);
 
 unlock_out:
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 setversion_out:
                mnt_drop_write_file(filp);
                return err;
@@ -510,9 +658,9 @@ group_add_out:
                 * ext4_ext_swap_inode_data before we switch the
                 * inode format to prevent read.
                 */
-               mutex_lock(&(inode->i_mutex));
+               inode_lock(inode);
                err = ext4_ext_migrate(inode);
-               mutex_unlock(&(inode->i_mutex));
+               inode_unlock(inode);
                mnt_drop_write_file(filp);
                return err;
        }
@@ -689,6 +837,60 @@ encryption_policy_out:
                return -EOPNOTSUPP;
 #endif
        }
+       case EXT4_IOC_FSGETXATTR:
+       {
+               struct fsxattr fa;
+
+               memset(&fa, 0, sizeof(struct fsxattr));
+               ext4_get_inode_flags(ei);
+               fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
+
+               if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                               EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+                       fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
+                               EXT4_I(inode)->i_projid);
+               }
+
+               if (copy_to_user((struct fsxattr __user *)arg,
+                                &fa, sizeof(fa)))
+                       return -EFAULT;
+               return 0;
+       }
+       case EXT4_IOC_FSSETXATTR:
+       {
+               struct fsxattr fa;
+               int err;
+
+               if (copy_from_user(&fa, (struct fsxattr __user *)arg,
+                                  sizeof(fa)))
+                       return -EFAULT;
+
+               /* Make sure caller has proper permission */
+               if (!inode_owner_or_capable(inode))
+                       return -EACCES;
+
+               err = mnt_want_write_file(filp);
+               if (err)
+                       return err;
+
+               flags = ext4_xflags_to_iflags(fa.fsx_xflags);
+               flags = ext4_mask_flags(inode->i_mode, flags);
+
+               inode_lock(inode);
+               flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
+                        (flags & EXT4_FL_XFLAG_VISIBLE);
+               err = ext4_ioctl_setflags(inode, flags);
+               inode_unlock(inode);
+               mnt_drop_write_file(filp);
+               if (err)
+                       return err;
+
+               err = ext4_ioctl_setproject(filp, fa.fsx_projid);
+               if (err)
+                       return err;
+
+               return 0;
+       }
        default:
                return -ENOTTY;
        }
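
For context, a minimal userspace sketch of driving the new ioctl pair. The
EXT4_IOC_FSGETXATTR/FSSETXATTR numbers follow XFS's existing layout, and
later kernels expose the same interface generically, so this hypothetical
example uses the FS_IOC_* names and struct fsxattr as found in newer
<linux/fs.h> uapi headers:

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <linux/fs.h>   /* struct fsxattr, FS_IOC_FS[GS]ETXATTR */

        /* Put an open file into the given project and mark it PROJINHERIT. */
        static int set_project(int fd, unsigned int projid)
        {
                struct fsxattr fa;

                if (ioctl(fd, FS_IOC_FSGETXATTR, &fa)) /* read-modify-write */
                        return -1;
                fa.fsx_projid = projid;
                fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
                return ioctl(fd, FS_IOC_FSSETXATTR, &fa);
        }
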
index f27e0c2598c59edb5685c27310c2766d0860838d..06574dd77614a3b1c4396d5bb3c017c681209d00 100644 (file)
@@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
                struct ext4_filename *fname,
                struct ext4_dir_entry_2 **res_dir);
 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
-                            struct dentry *dentry, struct inode *inode);
+                            struct inode *dir, struct inode *inode);
 
 /* checksumming functions */
 void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
@@ -1928,10 +1928,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
  * directory, and adds the dentry to the indexed directory.
  */
 static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
-                           struct dentry *dentry,
+                           struct inode *dir,
                            struct inode *inode, struct buffer_head *bh)
 {
-       struct inode    *dir = d_inode(dentry->d_parent);
        struct buffer_head *bh2;
        struct dx_root  *root;
        struct dx_frame frames[2], *frame;
@@ -2086,8 +2085,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                return retval;
 
        if (ext4_has_inline_data(dir)) {
-               retval = ext4_try_add_inline_entry(handle, &fname,
-                                                  dentry, inode);
+               retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);
                if (retval < 0)
                        goto out;
                if (retval == 1) {
@@ -2097,7 +2095,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
        }
 
        if (is_dx(dir)) {
-               retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
+               retval = ext4_dx_add_entry(handle, &fname, dir, inode);
                if (!retval || (retval != ERR_BAD_DX_DIR))
                        goto out;
                ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
@@ -2119,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 
                if (blocks == 1 && !dx_fallback &&
                    ext4_has_feature_dir_index(sb)) {
-                       retval = make_indexed_dir(handle, &fname, dentry,
+                       retval = make_indexed_dir(handle, &fname, dir,
                                                  inode, bh);
                        bh = NULL; /* make_indexed_dir releases bh */
                        goto out;
@@ -2154,12 +2152,11 @@ out:
  * Returns 0 for success, or a negative error value
  */
 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
-                            struct dentry *dentry, struct inode *inode)
+                            struct inode *dir, struct inode *inode)
 {
        struct dx_frame frames[2], *frame;
        struct dx_entry *entries, *at;
        struct buffer_head *bh;
-       struct inode *dir = d_inode(dentry->d_parent);
        struct super_block *sb = dir->i_sb;
        struct ext4_dir_entry_2 *de;
        int err;
@@ -2756,7 +2753,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
                return 0;
 
        WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
-                    !mutex_is_locked(&inode->i_mutex));
+                    !inode_is_locked(inode));
        /*
         * Exit early if the inode is already on the orphan list. This is a big speedup
         * since we don't have to contend on the global s_orphan_lock.
@@ -2838,7 +2835,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
                return 0;
 
        WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
-                    !mutex_is_locked(&inode->i_mutex));
+                    !inode_is_locked(inode));
        /* Do this quick check before taking global s_orphan_lock. */
        if (list_empty(&ei->i_orphan))
                return 0;
@@ -3212,6 +3209,12 @@ static int ext4_link(struct dentry *old_dentry,
        if (ext4_encrypted_inode(dir) &&
            !ext4_is_child_context_consistent_with_parent(dir, inode))
                return -EPERM;
+
+       if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
+          (!projid_eq(EXT4_I(dir)->i_projid,
+                      EXT4_I(old_dentry->d_inode)->i_projid)))
+               return -EXDEV;
+
        err = dquot_initialize(dir);
        if (err)
                return err;
@@ -3492,6 +3495,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
        int credits;
        u8 old_file_type;
 
+       if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
+           (!projid_eq(EXT4_I(new_dir)->i_projid,
+                       EXT4_I(old_dentry->d_inode)->i_projid)))
+               return -EXDEV;
+
        retval = dquot_initialize(old.dir);
        if (retval)
                return retval;
@@ -3701,6 +3709,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
                                                           new.inode)))
                return -EPERM;
 
+       if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
+            !projid_eq(EXT4_I(new_dir)->i_projid,
+                       EXT4_I(old_dentry->d_inode)->i_projid)) ||
+           (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) &&
+            !projid_eq(EXT4_I(old_dir)->i_projid,
+                       EXT4_I(new_dentry->d_inode)->i_projid)))
+               return -EXDEV;
+
        retval = dquot_initialize(old.dir);
        if (retval)
                return retval;
index f1b56ff0120894e3e4cc79886928ea7852ac5e25..3ed01ec011d75067579a4b894b4b0623af265a2d 100644 (file)
@@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
 
+/*
+ * Lock ordering
+ *
+ * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
+ * i_mmap_rwsem (inode->i_mmap_rwsem)!
+ *
+ * page fault path:
+ * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
+ *   page lock -> i_data_sem (rw)
+ *
+ * buffered write path:
+ * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> transaction start -> page lock ->
+ *   i_data_sem (rw)
+ *
+ * truncate:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ *   i_mmap_rwsem (w) -> page lock
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ *   transaction start -> i_data_sem (rw)
+ *
+ * direct IO:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) ->
+ *   transaction start -> i_data_sem (rw)
+ *
+ * writepages:
+ * transaction start -> page lock(s) -> i_data_sem (rw)
+ */
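
Rendered as code, the page-fault row of the table corresponds to the new
shape of ext4_page_mkwrite() elsewhere in this series (a nesting sketch, not
a complete function):

        /* Lock nesting on the write-fault path, outermost first */
        sb_start_pagefault(inode->i_sb);        /* freeze protection */
        down_read(&EXT4_I(inode)->i_mmap_sem);  /* vs. truncate/punch hole */
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                    ext4_writepage_trans_blocks(inode));
        lock_page(page);
        /* ... block allocation then takes i_data_sem (rw) innermost ... */
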
+
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 static struct file_system_type ext2_fs_type = {
        .owner          = THIS_MODULE,
@@ -958,6 +988,7 @@ static void init_once(void *foo)
        INIT_LIST_HEAD(&ei->i_orphan);
        init_rwsem(&ei->xattr_sem);
        init_rwsem(&ei->i_data_sem);
+       init_rwsem(&ei->i_mmap_sem);
        inode_init_once(&ei->vfs_inode);
 }
 
@@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
 }
 
 #ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
-#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+static char *quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
 
 static int ext4_write_dquot(struct dquot *dquot);
 static int ext4_acquire_dquot(struct dquot *dquot);
@@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = {
        .write_info     = ext4_write_info,
        .alloc_dquot    = dquot_alloc,
        .destroy_dquot  = dquot_destroy,
+       .get_projid     = ext4_get_projid,
 };
 
 static const struct quotactl_ops ext4_qctl_operations = {
@@ -2254,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb,
                                        __func__, inode->i_ino, inode->i_size);
                        jbd_debug(2, "truncating inode %lu to %lld bytes\n",
                                  inode->i_ino, inode->i_size);
-                       mutex_lock(&inode->i_mutex);
+                       inode_lock(inode);
                        truncate_inode_pages(inode->i_mapping, inode->i_size);
                        ext4_truncate(inode);
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        nr_truncates++;
                } else {
                        if (test_opt(sb, DEBUG))
@@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
                         "without CONFIG_QUOTA");
                return 0;
        }
+       if (ext4_has_feature_project(sb) && !readonly) {
+               ext4_msg(sb, KERN_ERR,
+                        "Filesystem with project quota feature cannot be mounted RDWR "
+                        "without CONFIG_QUOTA");
+               return 0;
+       }
 #endif  /* CONFIG_QUOTA */
        return 1;
 }
@@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
-       sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+       sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
 #endif
        memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 
@@ -4790,6 +4828,48 @@ restore_opts:
        return err;
 }
 
+#ifdef CONFIG_QUOTA
+static int ext4_statfs_project(struct super_block *sb,
+                              kprojid_t projid, struct kstatfs *buf)
+{
+       struct kqid qid;
+       struct dquot *dquot;
+       u64 limit;
+       u64 curblock;
+
+       qid = make_kqid_projid(projid);
+       dquot = dqget(sb, qid);
+       if (IS_ERR(dquot))
+               return PTR_ERR(dquot);
+       spin_lock(&dq_data_lock);
+
+       limit = (dquot->dq_dqb.dqb_bsoftlimit ?
+                dquot->dq_dqb.dqb_bsoftlimit :
+                dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+       if (limit && buf->f_blocks > limit) {
+               curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+               buf->f_blocks = limit;
+               buf->f_bfree = buf->f_bavail =
+                       (buf->f_blocks > curblock) ?
+                        (buf->f_blocks - curblock) : 0;
+       }
+
+       limit = dquot->dq_dqb.dqb_isoftlimit ?
+               dquot->dq_dqb.dqb_isoftlimit :
+               dquot->dq_dqb.dqb_ihardlimit;
+       if (limit && buf->f_files > limit) {
+               buf->f_files = limit;
+               buf->f_ffree =
+                       (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
+                        (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+       }
+
+       spin_unlock(&dq_data_lock);
+       dqput(dquot);
+       return 0;
+}
+#endif
+
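
A worked example of the clamping, with made-up numbers: suppose the shifted
soft limit comes to 262144 blocks and the project has 261120 blocks charged.
A statfs() on a PROJINHERIT directory then reports the quota, not the
filesystem:

        /*
         *   f_blocks = limit                      = 262144
         *   f_bfree  = f_bavail = 262144 - 261120 =   1024
         *
         * so "df" inside the project tree shows 1024 free blocks even when
         * the filesystem as a whole has far more space available.
         */
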
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
@@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 
+#ifdef CONFIG_QUOTA
+       if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
+           sb_has_quota_limits_enabled(sb, PRJQUOTA))
+               ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
+#endif
        return 0;
 }
 
@@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
        struct inode *qf_inode;
        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
-               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+               le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };
 
        BUG_ON(!ext4_has_feature_quota(sb));
@@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb)
        int type, err = 0;
        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
-               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+               le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+               le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };
 
        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
index 011ba6670d990285f6f61b56ea3fb8b421b70e59..c70d06a383e28819cc556f1027ea118272e5f738 100644 (file)
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+       down_write(&EXT4_I(inode)->i_mmap_sem);
        truncate_inode_pages(inode->i_mapping, inode->i_size);
        ext4_truncate(inode);
+       up_write(&EXT4_I(inode)->i_mmap_sem);
 }
 
 /*
index ac9e7c6aac74df601ddf3043c7ce364ad0cf2294..5c06db17e41fa267f5b270061d2959b2a36803e4 100644 (file)
@@ -794,7 +794,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        return ret;
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        isize = i_size_read(inode);
        if (start >= isize)
@@ -860,7 +860,7 @@ out:
        if (ret == 1)
                ret = 0;
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
index 18ddb1e5182a16e016224d449c6eb281c582b00e..ea272be62677004d29c8018691ea442c2c992bb1 100644 (file)
@@ -333,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
        loff_t isize;
        int err = 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        isize = i_size_read(inode);
        if (offset >= isize)
@@ -388,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 found:
        if (whence == SEEK_HOLE && data_ofs > isize)
                data_ofs = isize;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return vfs_setpos(file, data_ofs, maxbytes);
 fail:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return -ENXIO;
 }
 
@@ -1219,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode,
                        FALLOC_FL_INSERT_RANGE))
                return -EOPNOTSUPP;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                if (offset >= inode->i_size)
@@ -1243,7 +1243,7 @@ static long f2fs_fallocate(struct file *file, int mode,
        }
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        trace_f2fs_fallocate(inode, mode, offset, len, ret);
        return ret;
@@ -1307,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 
        flags = f2fs_mask_flags(inode->i_mode, flags);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        oldflags = fi->i_flags;
 
        if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
                if (!capable(CAP_LINUX_IMMUTABLE)) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        ret = -EPERM;
                        goto out;
                }
@@ -1322,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
        flags = flags & FS_FL_USER_MODIFIABLE;
        flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
        fi->i_flags = flags;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        f2fs_set_inode_flags(inode);
        inode->i_ctime = CURRENT_TIME;
@@ -1667,7 +1667,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 
        f2fs_balance_fs(sbi, true);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* writeback all dirty pages in the range */
        err = filemap_write_and_wait_range(inode->i_mapping, range->start,
@@ -1778,7 +1778,7 @@ do_map:
 clear_out:
        clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (!err)
                range->len = (u64)total << PAGE_CACHE_SHIFT;
        return err;
index 7def96caec5f7fc4abd82e51746bce4dce1f30ee..d0b95c95079bbdc085a9746226427ceb3805bc45 100644 (file)
@@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
 
        buf.dirent = dirent;
        buf.result = 0;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        buf.ctx.pos = file->f_pos;
        ret = -ENOENT;
        if (!IS_DEADDIR(inode)) {
@@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
                                    short_only, both ? &buf : NULL);
                file->f_pos = buf.ctx.pos;
        }
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (ret >= 0)
                ret = buf.result;
        return ret;
index 43d3475da83a79c8857e0861bf298df38a9d7ba7..f7018566883241bc65a981cf0d6e533e4d331241 100644 (file)
@@ -24,9 +24,9 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
 {
        u32 attr;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        attr = fat_make_attrs(inode);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return put_user(attr, user_attr);
 }
@@ -47,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
        err = mnt_want_write_file(file);
        if (err)
                goto out;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -109,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
        fat_save_attrs(inode, attr);
        mark_inode_dirty(inode);
 out_unlock_inode:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        mnt_drop_write_file(file);
 out:
        return err;
@@ -246,7 +246,7 @@ static long fat_fallocate(struct file *file, int mode,
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        if (mode & FALLOC_FL_KEEP_SIZE) {
                ondisksize = inode->i_blocks << 9;
                if ((offset + len) <= ondisksize)
@@ -272,7 +272,7 @@ static long fat_fallocate(struct file *file, int mode,
        }
 
 error:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err;
 }
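
One detail worth noting in the FALLOC_FL_KEEP_SIZE branch above: i_blocks counts 512-byte sectors, so the shift converts the block count to an on-disk byte size before comparing it with the requested range. In isolation:

    /* i_blocks is in 512-byte units, so the allocated on-disk size is: */
    loff_t ondisksize = inode->i_blocks << 9;    /* sectors * 512 bytes */

Only a range extending past ondisksize needs new clusters; anything inside it is already backed on disk.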
 
index 5797d45a78cb29fddcff617a8fc03139ed609369..c5618db110be454781d6c7b8a70f8284fa00a477 100644 (file)
@@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs)
 static struct file_system_type **find_filesystem(const char *name, unsigned len)
 {
        struct file_system_type **p;
-       for (p=&file_systems; *p; p=&(*p)->next)
-               if (strlen((*p)->name) == len &&
-                   strncmp((*p)->name, name, len) == 0)
+       for (p = &file_systems; *p; p = &(*p)->next)
+               if (strncmp((*p)->name, name, len) == 0 &&
+                   !(*p)->name[len])
                        break;
        return p;
 }
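
The fs/filesystems.c change above is a small optimization with effectively identical semantics: instead of computing strlen() on every registered name and then comparing, it compares the first len bytes and checks that the stored name terminates exactly there. A standalone equivalent of the new test, with hypothetical parameter names:

    /* True iff "stored" equals the first "len" bytes of "name" exactly:
     * the prefix matches and stored ends right at index len. */
    static int fs_name_matches(const char *stored, const char *name,
                               unsigned int len)
    {
            return strncmp(stored, name, len) == 0 && stored[len] == '\0';
    }

This walks each candidate name at most once instead of twice.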
index 712601f299b8a1436d69f0a31362ab57032f24b7..4b855b65d4577a2fd15389e22ddb40b2993f2e7a 100644 (file)
@@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
        if (!parent)
                return -ENOENT;
 
-       mutex_lock(&parent->i_mutex);
+       inode_lock(parent);
        if (!S_ISDIR(parent->i_mode))
                goto unlock;
 
@@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
        fuse_invalidate_entry(entry);
 
        if (child_nodeid != 0 && d_really_is_positive(entry)) {
-               mutex_lock(&d_inode(entry)->i_mutex);
+               inode_lock(d_inode(entry));
                if (get_node_id(d_inode(entry)) != child_nodeid) {
                        err = -ENOENT;
                        goto badentry;
@@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
                clear_nlink(d_inode(entry));
                err = 0;
  badentry:
-               mutex_unlock(&d_inode(entry)->i_mutex);
+               inode_unlock(d_inode(entry));
                if (!err)
                        d_delete(entry);
        } else {
@@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
        dput(entry);
 
  unlock:
-       mutex_unlock(&parent->i_mutex);
+       inode_unlock(parent);
        iput(parent);
        return err;
 }
@@ -1504,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode)
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
 
-       BUG_ON(!mutex_is_locked(&inode->i_mutex));
+       BUG_ON(!inode_is_locked(inode));
 
        spin_lock(&fc->lock);
        BUG_ON(fi->writectr < 0);
index aa03aab6a24f085b210e39b9d093c84c76235c28..b03d253ece159c4340765e46993127a53354b21b 100644 (file)
@@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
                return err;
 
        if (lock_inode)
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
 
        err = fuse_do_open(fc, get_node_id(inode), file, isdir);
 
@@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
                fuse_finish_open(inode, file);
 
        if (lock_inode)
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
        return err;
 }
@@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
        if (err)
                return err;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        fuse_sync_writes(inode);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        req = fuse_get_req_nofail_nopages(fc, file);
        memset(&inarg, 0, sizeof(inarg));
@@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
        if (is_bad_inode(inode))
                return -EIO;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * Start writeback against all dirty pages of the inode, then
@@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
                err = 0;
        }
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err;
 }
 
@@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                return generic_file_write_iter(iocb, from);
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
@@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        }
 out:
        current->backing_dev_info = NULL;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return written ? written : err;
 }
@@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 
        if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
                if (!write)
-                       mutex_lock(&inode->i_mutex);
+                       inode_lock(inode);
                fuse_sync_writes(inode);
                if (!write)
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
        }
 
        while (count) {
@@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
                return -EIO;
 
        /* Don't allow parallel writes to the same file */
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        res = generic_write_checks(iocb, from);
        if (res > 0)
                res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
        fuse_invalidate_attr(inode);
        if (res > 0)
                fuse_write_update_size(inode, iocb->ki_pos);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return res;
 }
@@ -2287,17 +2287,17 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
                retval = generic_file_llseek(file, offset, whence);
                break;
        case SEEK_END:
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                retval = fuse_update_attributes(inode, NULL, file, NULL);
                if (!retval)
                        retval = generic_file_llseek(file, offset, whence);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                break;
        case SEEK_HOLE:
        case SEEK_DATA:
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                retval = fuse_lseek(file, offset, whence);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                break;
        default:
                retval = -EINVAL;
@@ -2944,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
                return -EOPNOTSUPP;
 
        if (lock_inode) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                if (mode & FALLOC_FL_PUNCH_HOLE) {
                        loff_t endbyte = offset + length - 1;
                        err = filemap_write_and_wait_range(inode->i_mapping,
@@ -2990,7 +2990,7 @@ out:
                clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
        if (lock_inode)
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
        return err;
 }
index 7412863cda1e52736552ef0deea6aa84ff8c844f..c9384f932975efb8075d2fd2013116e8be9f055a 100644 (file)
@@ -914,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
        if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
                return -EOPNOTSUPP;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
        ret = gfs2_glock_nq(&gh);
@@ -946,7 +946,7 @@ out_unlock:
        gfs2_glock_dq(&gh);
 out_uninit:
        gfs2_holder_uninit(&gh);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
index 3e94400d587c4e1efdea6afee130b7b35c0a16f5..352f958769e15b150575c54b09d82b51d34f258e 100644 (file)
@@ -2067,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
        if (ret)
@@ -2094,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
        gfs2_glock_dq_uninit(&gh);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
index be6d9c450b22d212cfcdfa8d42e07f76705347e2..a398913442591892f6b7e24e14d2b1a8e30b7394 100644 (file)
@@ -888,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                return -ENOMEM;
 
        sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
-       mutex_lock(&ip->i_inode.i_mutex);
+       inode_lock(&ip->i_inode);
        for (qx = 0; qx < num_qd; qx++) {
                error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
                                           GL_NOCACHE, &ghs[qx]);
@@ -953,7 +953,7 @@ out_alloc:
 out:
        while (qx--)
                gfs2_glock_dq_uninit(&ghs[qx]);
-       mutex_unlock(&ip->i_inode.i_mutex);
+       inode_unlock(&ip->i_inode);
        kfree(ghs);
        gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
        return error;
@@ -1674,7 +1674,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
        if (error)
                goto out_put;
 
-       mutex_lock(&ip->i_inode.i_mutex);
+       inode_lock(&ip->i_inode);
        error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
        if (error)
                goto out_unlockput;
@@ -1739,7 +1739,7 @@ out_i:
 out_q:
        gfs2_glock_dq_uninit(&q_gh);
 out_unlockput:
-       mutex_unlock(&ip->i_inode.i_mutex);
+       inode_unlock(&ip->i_inode);
 out_put:
        qd_put(qd);
        return error;
index 70788e03820ae3e1c34689c95493798c20781928..e9f2b855f83160d3798e4ef62f6d86a7f0e69547 100644 (file)
@@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
 {
        struct hfs_readdir_data *rd = file->private_data;
        if (rd) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                list_del(&rd->list);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                kfree(rd);
        }
        return 0;
index b99ebddb10cb5b5e2d422c0f2151fc926455d26b..6686bf39a5b5a0b4bb83de60527cd873b612fc35 100644 (file)
@@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file)
        if (HFS_IS_RSRC(inode))
                inode = HFS_I(inode)->rsrc_inode;
        if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                hfs_file_truncate(inode);
                //if (inode->i_flags & S_DEAD) {
                //      hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
                //      hfs_delete_inode(inode);
                //}
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        return 0;
 }
@@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret)
                return ret;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* sync the inode to buffers */
        ret = write_inode_now(inode, 0);
@@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
        err = sync_blockdev(sb->s_bdev);
        if (!ret)
                ret = err;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
index d0f39dcbb58e86057c91c2e32a9d538178da39c5..a4e867e089478ddd3edaf24ce9be7d485d66ad34 100644 (file)
@@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 {
        struct hfsplus_readdir_data *rd = file->private_data;
        if (rd) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                list_del(&rd->list);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                kfree(rd);
        }
        return 0;
index 19b33f8151f1ab2cb0957b7e5772ff1206659aad..1a6394cdb54ef59520dbcd7f7577aee5d93d2561 100644 (file)
@@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
        if (HFSPLUS_IS_RSRC(inode))
                inode = HFSPLUS_I(inode)->rsrc_inode;
        if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                hfsplus_file_truncate(inode);
                if (inode->i_flags & S_DEAD) {
                        hfsplus_delete_cat(inode->i_ino,
                                           HFSPLUS_SB(sb)->hidden_dir, NULL);
                        hfsplus_delete_inode(inode);
                }
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        return 0;
 }
@@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
        error = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (error)
                return error;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * Sync inode metadata into the catalog and extent trees.
@@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
        if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return error;
 }
index 0624ce4e07022e3f047683a70cbbf1cdad6ff174..32a49e292b6a6b5d94421eaa8eb59541d15b12b5 100644 (file)
@@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
                goto out_drop_write;
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
            inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
@@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
        mark_inode_dirty(inode);
 
 out_unlock_inode:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 out_drop_write:
        mnt_drop_write_file(file);
 out:
index cfaa18c7a33796e0ee330d7b67e4b98bb6a95072..d1abbee281d19d8f51f1417bdb56f2adebc178f0 100644 (file)
@@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = fsync_file(HOSTFS_I(inode)->fd, datasync);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return ret;
 }
index dc540bfcee1d8e7842b6b0f0ca402214f9bf5609..e57a53c13d864a74431ee4a98653df083009a317 100644 (file)
@@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
        if (whence == SEEK_DATA || whence == SEEK_HOLE)
                return -EINVAL;
 
-       mutex_lock(&i->i_mutex);
+       inode_lock(i);
        hpfs_lock(s);
 
        /*pr_info("dir lseek\n");*/
@@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 ok:
        filp->f_pos = new_off;
        hpfs_unlock(s);
-       mutex_unlock(&i->i_mutex);
+       inode_unlock(i);
        return new_off;
 fail:
        /*pr_warn("illegal lseek: %016llx\n", new_off);*/
        hpfs_unlock(s);
-       mutex_unlock(&i->i_mutex);
+       inode_unlock(i);
        return -ESPIPE;
 }
 
index 8bbf7f3e2a27e0669e7f7fd5624a2cb5b50ce0fe..e1f465a389d5be1b27f8fd98451312bb94b454b8 100644 (file)
@@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
        vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        file_accessed(file);
 
        ret = -ENOMEM;
@@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        if (vma->vm_flags & VM_WRITE && inode->i_size < len)
                inode->i_size = len;
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return ret;
 }
@@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        if (hole_end > hole_start) {
                struct address_space *mapping = inode->i_mapping;
 
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                i_mmap_lock_write(mapping);
                if (!RB_EMPTY_ROOT(&mapping->i_mmap))
                        hugetlb_vmdelete_list(&mapping->i_mmap,
@@ -538,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                                                hole_end  >> PAGE_SHIFT);
                i_mmap_unlock_write(mapping);
                remove_inode_hugepages(inode, hole_start, hole_end);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
 
        return 0;
@@ -572,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
        start = offset >> hpage_shift;
        end = (offset + len + hpage_size - 1) >> hpage_shift;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
        error = inode_newsize_ok(inode, offset + len);
@@ -659,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                i_size_write(inode, offset + len);
        inode->i_ctime = CURRENT_TIME;
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return error;
 }
 
index e491e54d243025999e42fdec674b082a765188fd..9f62db3bcc3efcc9cc678ed0d9bc88152f692ff8 100644 (file)
@@ -495,7 +495,7 @@ void clear_inode(struct inode *inode)
         */
        spin_lock_irq(&inode->i_data.tree_lock);
        BUG_ON(inode->i_data.nrpages);
-       BUG_ON(inode->i_data.nrshadows);
+       BUG_ON(inode->i_data.nrexceptional);
        spin_unlock_irq(&inode->i_data.tree_lock);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
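
The clear_inode() assertion change reflects a rename on the address_space side: the nrshadows counter, which tracked shadow entries left behind by reclaim, becomes nrexceptional and now covers every exceptional radix-tree entry in the mapping (shadow entries, and on DAX filesystems the DAX entries as well). A sketch of the distinction, assuming the 4.5-era radix-tree API:

    #include <linux/radix-tree.h>

    /* Page-cache slots hold either plain page pointers or "exceptional"
     * entries tagged in the low pointer bits; only the latter are
     * counted in mapping->nrexceptional. */
    static bool slot_is_exceptional(void *entry)
    {
            return entry && radix_tree_exceptional_entry(entry);
    }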
@@ -966,9 +966,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
                swap(inode1, inode2);
 
        if (inode1 && !S_ISDIR(inode1->i_mode))
-               mutex_lock(&inode1->i_mutex);
+               inode_lock(inode1);
        if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
-               mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
+               inode_lock_nested(inode2, I_MUTEX_NONDIR2);
 }
 EXPORT_SYMBOL(lock_two_nondirectories);
 
@@ -980,9 +980,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
 void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
 {
        if (inode1 && !S_ISDIR(inode1->i_mode))
-               mutex_unlock(&inode1->i_mutex);
+               inode_unlock(inode1);
        if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
-               mutex_unlock(&inode2->i_mutex);
+               inode_unlock(inode2);
 }
 EXPORT_SYMBOL(unlock_two_nondirectories);
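
lock_two_nondirectories() above encodes the standard ABBA-avoidance rule: when two distinct inodes must be held at once, take them in a fixed global order (ascending pointer address, via the swap() visible at the top of the hunk) and annotate the second acquisition with a lockdep subclass (I_MUTEX_NONDIR2) so the nested acquisition is not flagged as recursion. A hedged generic sketch of the same discipline on bare mutexes, not the kernel helper itself:

    /* Lock two mutexes in address order so that concurrent callers
     * taking the same pair in either order cannot deadlock. */
    static void lock_pair(struct mutex *a, struct mutex *b)
    {
            if (a == b) {
                    mutex_lock(a);
                    return;
            }
            if (a > b)
                    swap(a, b);
            mutex_lock(a);
            mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
    }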
 
index 29466c380958ac411f4dcd46706837f2c55b03cd..116a333e9c773c10b76a8030c268284869c2533e 100644 (file)
@@ -434,9 +434,9 @@ int generic_block_fiemap(struct inode *inode,
                         u64 len, get_block_t *get_block)
 {
        int ret;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 EXPORT_SYMBOL(generic_block_fiemap);
index a3750f902adcbae8f8cabbad066f9d629b805155..0ae91ad6df2dc916f077495876ec7fe1e47902ed 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mtd/mtd.h>
+#include <linux/mm.h> /* kvfree() */
 #include "nodelist.h"
 
 static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
@@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
        return 0;
 
  out_free:
-#ifndef __ECOS
-       if (jffs2_blocks_use_vmalloc(c))
-               vfree(c->blocks);
-       else
-#endif
-               kfree(c->blocks);
+       kvfree(c->blocks);
 
        return ret;
 }
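
The three jffs2 hunks in this area (here, and in fs/jffs2/fs.c and fs/jffs2/super.c below) collapse the same vfree-or-kfree conditional into kvfree(), which picks the right deallocator from the pointer itself. Roughly what that helper does, as implemented in mm/util.c at the time:

    void kvfree(const void *addr)
    {
            if (is_vmalloc_addr(addr))
                    vfree(addr);
            else
                    kfree(addr);
    }

That is why the jffs2_blocks_use_vmalloc() test, and the __ECOS guard around it, can disappear at every call site.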
index f509f62e12f6ef85e95b5c159353cba757ef4af1..c5ac5944bc1bd8ef1362e5911656f3bfb68ea70b 100644 (file)
@@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        /* Trigger GC to flush any pending writes for this inode */
        jffs2_flush_wbuf_gc(c, inode->i_ino);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return 0;
 }
index 2caf1682036dc1914aaa887565b0a54edbe0ed35..bead25ae8fe4a563d257995102da2cbe7336eeb3 100644 (file)
@@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 out_root:
        jffs2_free_ino_caches(c);
        jffs2_free_raw_node_refs(c);
-       if (jffs2_blocks_use_vmalloc(c))
-               vfree(c->blocks);
-       else
-               kfree(c->blocks);
+       kvfree(c->blocks);
  out_inohash:
        jffs2_clear_xattr_subsystem(c);
        kfree(c->inocache_list);
index bb080c272149312ea382ee578ffdaa903620a43a..0a9a114bb9d1138b3060365986304cdb2df1428f 100644 (file)
@@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
 
        jffs2_free_ino_caches(c);
        jffs2_free_raw_node_refs(c);
-       if (jffs2_blocks_use_vmalloc(c))
-               vfree(c->blocks);
-       else
-               kfree(c->blocks);
+       kvfree(c->blocks);
        jffs2_flash_cleanup(c);
        kfree(c->inocache_list);
        jffs2_clear_xattr_subsystem(c);
index 0e026a7bdcd4d9512b8074016b9f56a31de3b8f9..4ce7735dd042267bd086fd420066592e3b6855d3 100644 (file)
@@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        if (rc)
                return rc;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        if (!(inode->i_state & I_DIRTY_ALL) ||
            (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
                /* Make sure committed changes hit the disk */
                jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                return rc;
        }
 
        rc |= jfs_commit_inode(inode, 1);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return rc ? -EIO : 0;
 }
index 8db8b7d61e4048cf97ac289f263bc78c8685294d..8653cac7e12eed46e0bc360741004dfd3f1ce54d 100644 (file)
@@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                }
 
                /* Lock against other parallel changes of flags */
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
 
                jfs_get_inode_flags(jfs_inode);
                oldflags = jfs_inode->mode2;
@@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        ((flags ^ oldflags) &
                        (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
                        if (!capable(CAP_LINUX_IMMUTABLE)) {
-                               mutex_unlock(&inode->i_mutex);
+                               inode_unlock(inode);
                                err = -EPERM;
                                goto setflags_out;
                        }
@@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                jfs_inode->mode2 = flags;
 
                jfs_set_inode_flags(inode);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                inode->i_ctime = CURRENT_TIME_SEC;
                mark_inode_dirty(inode);
 setflags_out:
index 900925b5eb8c225b151b58cf8133ee6a6e11e1b1..4f5d85ba8e237e91d3f314b330e53e26ad2cc22b 100644 (file)
@@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
        struct buffer_head tmp_bh;
        struct buffer_head *bh;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        while (towrite > 0) {
                tocopy = sb->s_blocksize - offset < towrite ?
                                sb->s_blocksize - offset : towrite;
@@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
        }
 out:
        if (len == towrite) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                return err;
        }
        if (inode->i_size < off+len-towrite)
@@ -832,7 +832,7 @@ out:
        inode->i_version++;
        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        mark_inode_dirty(inode);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return len - towrite;
 }
 
index 821973853340a106874da673618bf57bfbcc1178..996b7742c90b2eaf1e8b154ae3cc580293d6f94b 100644 (file)
@@ -1511,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
        struct inode *inode = file_inode(file);
        loff_t ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = generic_file_llseek(file, offset, whence);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return ret;
 }
index 01491299f348c965adc27cbcd70340ab2d980946..0ca80b2af42015c309718b3328315471808cfa4c 100644 (file)
@@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close);
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 {
        struct dentry *dentry = file->f_path.dentry;
-       mutex_lock(&d_inode(dentry)->i_mutex);
+       inode_lock(d_inode(dentry));
        switch (whence) {
                case 1:
                        offset += file->f_pos;
@@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
                        if (offset >= 0)
                                break;
                default:
-                       mutex_unlock(&d_inode(dentry)->i_mutex);
+                       inode_unlock(d_inode(dentry));
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
@@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
                        spin_unlock(&dentry->d_lock);
                }
        }
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
        return offset;
 }
 EXPORT_SYMBOL(dcache_dir_lseek);
@@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
        if (err)
                return err;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = sync_mapping_buffers(inode->i_mapping);
        if (!(inode->i_state & I_DIRTY_ALL))
                goto out;
@@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
                ret = err;
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 EXPORT_SYMBOL(__generic_file_fsync);
index af1ed74a657fbc93f64386bcad54d9826bc34684..7c5f91be9b65c4ddabe441d00bbda616ca0c2f26 100644 (file)
@@ -1650,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
         * bother, maybe that's a sign this just isn't a good file to
         * hand out a delegation on.
         */
-       if (is_deleg && !mutex_trylock(&inode->i_mutex))
+       if (is_deleg && !inode_trylock(inode))
                return -EAGAIN;
 
        if (is_deleg && arg == F_WRLCK) {
                /* Write delegations are not currently supported: */
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                WARN_ON_ONCE(1);
                return -EINVAL;
        }
@@ -1732,7 +1732,7 @@ out:
        spin_unlock(&ctx->flc_lock);
        locks_dispose_list(&dispose);
        if (is_deleg)
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        if (!error && !my_fl)
                *flp = NULL;
        return error;
index 1a6f0167b16a7a4b657138ed7c366e8fe3151dfd..61eaeb1b6cac10e664539b2132607180d8fe24d0 100644 (file)
@@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                if (err)
                        return err;
 
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                oldflags = li->li_flags;
                flags &= LOGFS_FL_USER_MODIFIABLE;
                flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
                li->li_flags = flags;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
                inode->i_ctime = CURRENT_TIME;
                mark_inode_dirty_sync(inode);
@@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        logfs_get_wblocks(sb, NULL, WF_LOCK);
        logfs_write_anchor(sb);
        logfs_put_wblocks(sb, NULL, WF_LOCK);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return 0;
 }
index bceefd5588a2bf908626a0a6282a5f810ecfc5c1..f624d132e01e6f5488fd0d9af13007b015e1e178 100644 (file)
@@ -1629,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
        parent = nd->path.dentry;
        BUG_ON(nd->inode != parent->d_inode);
 
-       mutex_lock(&parent->d_inode->i_mutex);
+       inode_lock(parent->d_inode);
        dentry = __lookup_hash(&nd->last, parent, nd->flags);
-       mutex_unlock(&parent->d_inode->i_mutex);
+       inode_unlock(parent->d_inode);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
        path->mnt = nd->path.mnt;
@@ -2229,10 +2229,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
                putname(filename);
                return ERR_PTR(-EINVAL);
        }
-       mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        d = __lookup_hash(&last, path->dentry, 0);
        if (IS_ERR(d)) {
-               mutex_unlock(&path->dentry->d_inode->i_mutex);
+               inode_unlock(path->dentry->d_inode);
                path_put(path);
        }
        putname(filename);
@@ -2282,7 +2282,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
        unsigned int c;
        int err;
 
-       WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+       WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
        this.name = name;
        this.len = len;
@@ -2380,9 +2380,9 @@ struct dentry *lookup_one_len_unlocked(const char *name,
        if (ret)
                return ret;
 
-       mutex_lock(&base->d_inode->i_mutex);
+       inode_lock(base->d_inode);
        ret =  __lookup_hash(&this, base, 0);
-       mutex_unlock(&base->d_inode->i_mutex);
+       inode_unlock(base->d_inode);
        return ret;
 }
 EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2463,7 +2463,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
                goto done;
        }
 
-       mutex_lock(&dir->d_inode->i_mutex);
+       inode_lock(dir->d_inode);
        dentry = d_lookup(dir, &nd->last);
        if (!dentry) {
                /*
@@ -2473,16 +2473,16 @@ mountpoint_last(struct nameidata *nd, struct path *path)
                 */
                dentry = d_alloc(dir, &nd->last);
                if (!dentry) {
-                       mutex_unlock(&dir->d_inode->i_mutex);
+                       inode_unlock(dir->d_inode);
                        return -ENOMEM;
                }
                dentry = lookup_real(dir->d_inode, dentry, nd->flags);
                if (IS_ERR(dentry)) {
-                       mutex_unlock(&dir->d_inode->i_mutex);
+                       inode_unlock(dir->d_inode);
                        return PTR_ERR(dentry);
                }
        }
-       mutex_unlock(&dir->d_inode->i_mutex);
+       inode_unlock(dir->d_inode);
 
 done:
        if (d_is_negative(dentry)) {
@@ -2672,7 +2672,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
        struct dentry *p;
 
        if (p1 == p2) {
-               mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+               inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                return NULL;
        }
 
@@ -2680,29 +2680,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
        p = d_ancestor(p2, p1);
        if (p) {
-               mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+               inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+               inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
                return p;
        }
 
        p = d_ancestor(p1, p2);
        if (p) {
-               mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+               inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+               inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
                return p;
        }
 
-       mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-       mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
+       inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+       inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
        return NULL;
 }
 EXPORT_SYMBOL(lock_rename);
 
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
-       mutex_unlock(&p1->d_inode->i_mutex);
+       inode_unlock(p1->d_inode);
        if (p1 != p2) {
-               mutex_unlock(&p2->d_inode->i_mutex);
+               inode_unlock(p2->d_inode);
                mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
        }
 }
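
The locking hierarchy in lock_rename() above is: the per-superblock s_vfs_rename_mutex first (its release is visible in unlock_rename()), then the ancestor directory before the descendant when one parent contains the other, and otherwise both parents in a fixed order with distinct lockdep subclasses. A hedged usage sketch with hypothetical dentry names:

    /* lock_rename() returns the common ancestor (or NULL) with both
     * parents locked; callers must refuse to operate on that "trap"
     * dentry before proceeding. */
    struct dentry *trap = lock_rename(old_parent, new_parent);
    /* ... fail with -EINVAL if source or target dentry == trap ... */
    unlock_rename(old_parent, new_parent);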
@@ -3141,9 +3141,9 @@ retry_lookup:
                 * dropping this one anyway.
                 */
        }
-       mutex_lock(&dir->d_inode->i_mutex);
+       inode_lock(dir->d_inode);
        error = lookup_open(nd, &path, file, op, got_write, opened);
-       mutex_unlock(&dir->d_inode->i_mutex);
+       inode_unlock(dir->d_inode);
 
        if (error <= 0) {
                if (error)
@@ -3489,7 +3489,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
         * Do the final lookup.
         */
        lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
-       mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path->dentry, lookup_flags);
        if (IS_ERR(dentry))
                goto unlock;
@@ -3518,7 +3518,7 @@ fail:
        dput(dentry);
        dentry = ERR_PTR(error);
 unlock:
-       mutex_unlock(&path->dentry->d_inode->i_mutex);
+       inode_unlock(path->dentry->d_inode);
        if (!err2)
                mnt_drop_write(path->mnt);
 out:
@@ -3538,7 +3538,7 @@ EXPORT_SYMBOL(kern_path_create);
 void done_path_create(struct path *path, struct dentry *dentry)
 {
        dput(dentry);
-       mutex_unlock(&path->dentry->d_inode->i_mutex);
+       inode_unlock(path->dentry->d_inode);
        mnt_drop_write(path->mnt);
        path_put(path);
 }
@@ -3735,7 +3735,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
                return -EPERM;
 
        dget(dentry);
-       mutex_lock(&dentry->d_inode->i_mutex);
+       inode_lock(dentry->d_inode);
 
        error = -EBUSY;
        if (is_local_mountpoint(dentry))
@@ -3755,7 +3755,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
        detach_mounts(dentry);
 
 out:
-       mutex_unlock(&dentry->d_inode->i_mutex);
+       inode_unlock(dentry->d_inode);
        dput(dentry);
        if (!error)
                d_delete(dentry);
@@ -3794,7 +3794,7 @@ retry:
        if (error)
                goto exit1;
 
-       mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
@@ -3810,7 +3810,7 @@ retry:
 exit3:
        dput(dentry);
 exit2:
-       mutex_unlock(&path.dentry->d_inode->i_mutex);
+       inode_unlock(path.dentry->d_inode);
        mnt_drop_write(path.mnt);
 exit1:
        path_put(&path);
@@ -3856,7 +3856,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
        if (!dir->i_op->unlink)
                return -EPERM;
 
-       mutex_lock(&target->i_mutex);
+       inode_lock(target);
        if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
@@ -3873,7 +3873,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
                }
        }
 out:
-       mutex_unlock(&target->i_mutex);
+       inode_unlock(target);
 
        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
        if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
@@ -3916,7 +3916,7 @@ retry:
        if (error)
                goto exit1;
 retry_deleg:
-       mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = __lookup_hash(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
@@ -3934,7 +3934,7 @@ retry_deleg:
 exit2:
                dput(dentry);
        }
-       mutex_unlock(&path.dentry->d_inode->i_mutex);
+       inode_unlock(path.dentry->d_inode);
        if (inode)
                iput(inode);    /* truncate the inode here */
        inode = NULL;
@@ -4086,7 +4086,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
        if (error)
                return error;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        /* Make sure we don't allow creating hardlink to an unlinked file */
        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
                error =  -ENOENT;
@@ -4103,7 +4103,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
                inode->i_state &= ~I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
        return error;
@@ -4303,7 +4303,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (!is_dir || (flags & RENAME_EXCHANGE))
                lock_two_nondirectories(source, target);
        else if (target)
-               mutex_lock(&target->i_mutex);
+               inode_lock(target);
 
        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
@@ -4356,7 +4356,7 @@ out:
        if (!is_dir || (flags & RENAME_EXCHANGE))
                unlock_two_nondirectories(source, target);
        else if (target)
-               mutex_unlock(&target->i_mutex);
+               inode_unlock(target);
        dput(new_dentry);
        if (!error) {
                fsnotify_move(old_dir, new_dir, old_name, is_dir,
index a830e1463704b590b72947d313cd50b454555668..4fb1691b435552c044134b56aa2032da35684b5b 100644 (file)
@@ -1961,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path)
        struct vfsmount *mnt;
        struct dentry *dentry = path->dentry;
 retry:
-       mutex_lock(&dentry->d_inode->i_mutex);
+       inode_lock(dentry->d_inode);
        if (unlikely(cant_mount(dentry))) {
-               mutex_unlock(&dentry->d_inode->i_mutex);
+               inode_unlock(dentry->d_inode);
                return ERR_PTR(-ENOENT);
        }
        namespace_lock();
@@ -1974,13 +1974,13 @@ retry:
                        mp = new_mountpoint(dentry);
                if (IS_ERR(mp)) {
                        namespace_unlock();
-                       mutex_unlock(&dentry->d_inode->i_mutex);
+                       inode_unlock(dentry->d_inode);
                        return mp;
                }
                return mp;
        }
        namespace_unlock();
-       mutex_unlock(&path->dentry->d_inode->i_mutex);
+       inode_unlock(path->dentry->d_inode);
        path_put(path);
        path->mnt = mnt;
        dentry = path->dentry = dget(mnt->mnt_root);
@@ -1992,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where)
        struct dentry *dentry = where->m_dentry;
        put_mountpoint(where);
        namespace_unlock();
-       mutex_unlock(&dentry->d_inode->i_mutex);
+       inode_unlock(dentry->d_inode);
 }
 
 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
index f0e3e9e747dd72aaa756611cb0c881d19b902d5a..26c2de2de13fd0ce7bb55287c58e268cf4bf37e6 100644 (file)
@@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
        if (!res) {
                struct inode *inode = d_inode(dentry);
 
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
                        ncp_new_dentry(dentry);
                        val=1;
@@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
                        ncp_dbg(2, "found, but dirEntNum changed\n");
 
                ncp_update_inode2(inode, &finfo);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
 
 finished:
@@ -639,9 +639,9 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
        } else {
                struct inode *inode = d_inode(newdent);
 
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+               inode_lock_nested(inode, I_MUTEX_CHILD);
                ncp_update_inode2(inode, entry);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
 
        if (ctl.idx >= NCP_DIRCACHE_SIZE) {
index 011324ce9df21181d6983f1fe81c659e1120a814..dd38ca1f2ecb9a181668496cc056653e1e470be2 100644 (file)
@@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        iocb->ki_pos = pos;
 
        if (pos > i_size_read(inode)) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                if (pos > i_size_read(inode))
                        i_size_write(inode, pos);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        ncp_dbg(1, "exit %pD2\n", file);
 outrel:
index c82a21228a342a8f4e6c42ea0d0366b225cb0c74..9cce67043f92a589193625ca3005d0b1e0f4e36b 100644 (file)
@@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
        dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
                        filp, offset, whence);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        switch (whence) {
                case 1:
                        offset += filp->f_pos;
@@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
                dir_ctx->duped = 0;
        }
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return offset;
 }
 
@@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
 
        dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return 0;
 }
 
index 7ab7ec9f4eed8a0212d2079d7c2c79cb331d7a20..7a0cfd3266e561620577bef3cf449171f10d9f92 100644 (file)
@@ -580,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
        if (!count)
                goto out;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        result = nfs_sync_mapping(mapping);
        if (result)
                goto out_unlock;
@@ -608,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
        NFS_I(inode)->read_io += count;
        result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (!result) {
                result = nfs_direct_wait(dreq);
@@ -622,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 out_release:
        nfs_direct_req_release(dreq);
 out_unlock:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 out:
        return result;
 }
@@ -1005,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
        pos = iocb->ki_pos;
        end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        result = nfs_sync_mapping(mapping);
        if (result)
@@ -1045,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
                                              pos >> PAGE_CACHE_SHIFT, end);
        }
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (!result) {
                result = nfs_direct_wait(dreq);
@@ -1066,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 out_release:
        nfs_direct_req_release(dreq);
 out_unlock:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return result;
 }
 
index 4ef8f5addcadf8a7d8c67dbb4bd2a9583d26a487..748bb813b8ecd63095f2f50b8e733fe960e188ec 100644 (file)
@@ -278,9 +278,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
                if (ret != 0)
                        break;
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                ret = nfs_file_fsync_commit(file, start, end, datasync);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                /*
                 * If nfs_file_fsync_commit detected a server reboot, then
                 * resend all dirty pages that might have been covered by
index bb1f4e7a3270e0ff641ebea34c51f3080e01354c..3384dc8e66836c1fe4bc4e0a507a7e9250c5e719 100644 (file)
@@ -971,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req,
        u32 i, j;
 
        if (fl->commit_through_mds) {
-               nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+               nfs_request_add_commit_list(req, cinfo);
        } else {
                /* Note that we are calling nfs4_fl_calc_j_index on each page
                 * that ends up being committed to a data server.  An attractive
index 6594e9f903a0315dde75ac3fbccecf64521d22e9..5bcd92d50e820ab6ccfa5e7789ee01152176b857 100644 (file)
@@ -1948,11 +1948,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
        start = xdr_reserve_space(xdr, 4);
        BUG_ON(!start);
 
-       if (ff_layout_encode_ioerr(flo, xdr, args))
-               goto out;
-
+       ff_layout_encode_ioerr(flo, xdr, args);
        ff_layout_encode_iostats(flo, xdr, args);
-out:
+
        *start = cpu_to_be32((xdr->p - start - 1) * 4);
        dprintk("%s: Return\n", __func__);
 }
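
ff_layout_encode_layoutreturn() above also illustrates the usual XDR back-patch pattern: reserve a 4-byte length slot, encode the variable-size body, then overwrite the slot with the body size computed from how far the write pointer advanced. In isolation, assuming a hypothetical encode_body():

    __be32 *lenp = xdr_reserve_space(xdr, 4);   /* placeholder length */
    encode_body(xdr);                           /* hypothetical payload */
    /* xdr->p advances in 4-byte words; subtract the slot itself,
     * then scale words to bytes. */
    *lenp = cpu_to_be32((xdr->p - lenp - 1) * 4);

The change itself simply makes ff_layout_encode_ioerr() unconditionally followed by the iostats encoding, since its return value no longer gates anything.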
index bd0327541366c2066b7720933373d0c6758ad4e0..29898a9550fa6f72541cbedc7ac1f0270c5a7c5c 100644 (file)
@@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
        err->length = end - err->offset;
 }
 
-static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err,  u64 offset,
-                              u64 length, int status, enum nfs_opnum4 opnum,
-                              nfs4_stateid *stateid,
-                              struct nfs4_deviceid *deviceid)
+static int
+ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
+               const struct nfs4_ff_layout_ds_err *e2)
 {
-       return err->status == status && err->opnum == opnum &&
-              nfs4_stateid_match(&err->stateid, stateid) &&
-              !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
-              end_offset(err->offset, err->length) >= offset &&
-              err->offset <= end_offset(offset, length);
-}
-
-static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
-                          struct nfs4_ff_layout_ds_err *new)
-{
-       if (!ds_error_can_merge(old, new->offset, new->length, new->status,
-                               new->opnum, &new->stateid, &new->deviceid))
-               return false;
-
-       extend_ds_error(old, new->offset, new->length);
-       return true;
+       int ret;
+
+       if (e1->opnum != e2->opnum)
+               return e1->opnum < e2->opnum ? -1 : 1;
+       if (e1->status != e2->status)
+               return e1->status < e2->status ? -1 : 1;
+       ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+       if (ret != 0)
+               return ret;
+       ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
+       if (ret != 0)
+               return ret;
+       if (end_offset(e1->offset, e1->length) < e2->offset)
+               return -1;
+       if (e1->offset > end_offset(e2->offset, e2->length))
+               return 1;
+       /* If ranges overlap or are contiguous, they are the same */
+       return 0;
 }
 
-static bool
+static void
 ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
                              struct nfs4_ff_layout_ds_err *dserr)
 {
-       struct nfs4_ff_layout_ds_err *err;
-
-       list_for_each_entry(err, &flo->error_list, list) {
-               if (merge_ds_error(err, dserr)) {
-                       return true;
-               }
-       }
-
-       list_add(&dserr->list, &flo->error_list);
-       return false;
-}
-
-static bool
-ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
-                         u64 length, int status, enum nfs_opnum4 opnum,
-                         nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
-{
-       bool found = false;
-       struct nfs4_ff_layout_ds_err *err;
-
-       list_for_each_entry(err, &flo->error_list, list) {
-               if (ds_error_can_merge(err, offset, length, status, opnum,
-                                      stateid, deviceid)) {
-                       found = true;
-                       extend_ds_error(err, offset, length);
+       struct nfs4_ff_layout_ds_err *err, *tmp;
+       struct list_head *head = &flo->error_list;
+       int match;
+
+       /* Do insertion sort w/ merges */
+       list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
+               match = ff_ds_error_match(err, dserr);
+               if (match < 0)
+                       continue;
+               if (match > 0) {
+                       /* Add entry "dserr" _before_ entry "err" */
+                       head = &err->list;
                        break;
                }
+               /* Entries match, so merge "err" into "dserr" */
+               extend_ds_error(dserr, err->offset, err->length);
+               list_del(&err->list);
+               kfree(err);
        }
 
-       return found;
+       list_add_tail(&dserr->list, head);
 }
 
 int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
@@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
                             gfp_t gfp_flags)
 {
        struct nfs4_ff_layout_ds_err *dserr;
-       bool needfree;
 
        if (status == 0)
                return 0;
@@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
        if (mirror->mirror_ds == NULL)
                return -EINVAL;
 
-       spin_lock(&flo->generic_hdr.plh_inode->i_lock);
-       if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
-                                     &mirror->stateid,
-                                     &mirror->mirror_ds->id_node.deviceid)) {
-               spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-               return 0;
-       }
-       spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
        dserr = kmalloc(sizeof(*dserr), gfp_flags);
        if (!dserr)
                return -ENOMEM;
@@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
               NFS4_DEVICEID4_SIZE);
 
        spin_lock(&flo->generic_hdr.plh_inode->i_lock);
-       needfree = ff_layout_add_ds_error_locked(flo, dserr);
+       ff_layout_add_ds_error_locked(flo, dserr);
        spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-       if (needfree)
-               kfree(dserr);
 
        return 0;
 }
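
The flexfiles rework above replaces a racy two-pass scheme (try to extend an existing error entry under the inode lock, drop the lock, allocate, retake the lock and scan again) with a single sorted insertion: ff_ds_error_match() defines a total order, and ff_layout_add_ds_error_locked() absorbs every overlapping or contiguous entry into the new one before linking it in place. The same idea on plain integer ranges, as a self-contained sketch under the assumption that entries are kmalloc'd:

    #include <linux/list.h>
    #include <linux/slab.h>

    struct range {
            u64 start, len;
            struct list_head list;
    };

    /* Walk a sorted list, merge every range that overlaps or touches
     * "new" into it, then insert the (possibly grown) range before the
     * first entry that sorts strictly after it. */
    static void insert_merged(struct list_head *head, struct range *new)
    {
            struct range *r, *tmp;
            struct list_head *pos = head;
            u64 end;

            list_for_each_entry_safe(r, tmp, head, list) {
                    if (r->start + r->len < new->start)
                            continue;           /* strictly before new */
                    if (new->start + new->len < r->start) {
                            pos = &r->list;     /* strictly after new */
                            break;
                    }
                    end = max(r->start + r->len, new->start + new->len);
                    new->start = min(new->start, r->start);
                    new->len = end - new->start;
                    list_del(&r->list);
                    kfree(r);
            }
            list_add_tail(&new->list, pos);     /* insert before pos */
    }

Because the list is kept sorted and merged eagerly, it can never hold two mergeable entries, which the old update-then-insert dance could not guarantee once it dropped the lock between passes.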
index 8e24d886d2c5991c044965c3fa3093f3142632b8..86faecf8f328f2639bab1c5f7391df6d659610f2 100644 (file)
@@ -661,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        trace_nfs_getattr_enter(inode);
        /* Flush out writes to the server in order to update c/mtime.  */
        if (S_ISREG(inode->i_mode)) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                err = nfs_sync_inode(inode);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                if (err)
                        goto out;
        }
@@ -1178,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode,
        spin_unlock(&inode->i_lock);
        trace_nfs_invalidate_mapping_enter(inode);
        if (may_lock) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                ret = nfs_invalidate_mapping(inode, mapping);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        } else
                ret = nfs_invalidate_mapping(inode, mapping);
        trace_nfs_invalidate_mapping_exit(inode, ret);
index 4e8cc942336c83dcbf9b5b615eb21dfa14be758d..9a547aa3ec8e873160da5a36906eb3813650034d 100644 (file)
@@ -484,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list,
                      struct nfs_commit_info *cinfo,
                      u32 ds_commit_idx);
 void nfs_commitdata_release(struct nfs_commit_data *data);
-void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+void nfs_request_add_commit_list(struct nfs_page *req,
                                 struct nfs_commit_info *cinfo);
 void nfs_request_add_commit_list_locked(struct nfs_page *req,
                struct list_head *dst,
index 6e8174930a48eb705979c93fca0d674fba72c0e7..bd25dc7077f72f5a79dfbf81e27f15a75a85dd92 100644 (file)
@@ -101,13 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
        if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
                return -EOPNOTSUPP;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        err = nfs42_proc_fallocate(&msg, filep, offset, len);
        if (err == -EOPNOTSUPP)
                NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err;
 }
 
@@ -123,7 +123,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
                return -EOPNOTSUPP;
 
        nfs_wb_all(inode);
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        err = nfs42_proc_fallocate(&msg, filep, offset, len);
        if (err == 0)
@@ -131,7 +131,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
        if (err == -EOPNOTSUPP)
                NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err;
 }
 
index 26f9a23e2b254998d6b0c053e2e0892d2cd16050..57ca1c8039c1e00bfcb6e9bb4d8fe7e40f4578d5 100644 (file)
@@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
                if (ret != 0)
                        break;
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                ret = nfs_file_fsync_commit(file, start, end, datasync);
                if (!ret)
                        ret = pnfs_sync_inode(inode, !!datasync);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                /*
                 * If nfs_file_fsync_commit detected a server reboot, then
                 * resend all dirty pages that might have been covered by
@@ -219,13 +219,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
 
        /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
        if (same_inode) {
-               mutex_lock(&src_inode->i_mutex);
+               inode_lock(src_inode);
        } else if (dst_inode < src_inode) {
-               mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+               inode_lock_nested(dst_inode, I_MUTEX_PARENT);
+               inode_lock_nested(src_inode, I_MUTEX_CHILD);
        } else {
-               mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+               inode_lock_nested(src_inode, I_MUTEX_PARENT);
+               inode_lock_nested(dst_inode, I_MUTEX_CHILD);
        }
 
        /* flush all pending writes on both src and dst so that server
@@ -246,13 +246,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
 
 out_unlock:
        if (same_inode) {
-               mutex_unlock(&src_inode->i_mutex);
+               inode_unlock(src_inode);
        } else if (dst_inode < src_inode) {
-               mutex_unlock(&src_inode->i_mutex);
-               mutex_unlock(&dst_inode->i_mutex);
+               inode_unlock(src_inode);
+               inode_unlock(dst_inode);
        } else {
-               mutex_unlock(&dst_inode->i_mutex);
-               mutex_unlock(&src_inode->i_mutex);
+               inode_unlock(dst_inode);
+               inode_unlock(src_inode);
        }
 out:
        return ret;
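
The clone path above has to hold two inode locks at once; taking the pair in pointer order guarantees that two tasks cloning between the same inodes in opposite directions cannot deadlock, and the PARENT/CHILD lockdep subclasses tell the lock validator the nested acquisitions are intentional. A generic sketch of the idiom (hypothetical helper, not from this patch):

    /* Lock two inodes in a stable (address) order so that concurrent
     * callers locking the same pair can never deadlock. */
    static void lock_two_inodes(struct inode *a, struct inode *b)
    {
    	if (a == b) {
    		inode_lock(a);
    	} else if (a < b) {
    		inode_lock_nested(a, I_MUTEX_PARENT);
    		inode_lock_nested(b, I_MUTEX_CHILD);
    	} else {
    		inode_lock_nested(b, I_MUTEX_PARENT);
    		inode_lock_nested(a, I_MUTEX_CHILD);
    	}
    }

Unlocking, as the out_unlock label shows, simply mirrors the same comparison.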
index ce43cd6d88c634773ae86a2922d3700d6f97ad33..5754835a288608bd7dc3b58b8f92a2a6ee8a20bc 100644 (file)
@@ -830,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
  * holding the nfs_page lock.
  */
 void
-nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
-                           struct nfs_commit_info *cinfo)
+nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
        spin_lock(cinfo->lock);
-       nfs_request_add_commit_list_locked(req, dst, cinfo);
+       nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
        spin_unlock(cinfo->lock);
        nfs_mark_page_unstable(req->wb_page, cinfo);
 }
@@ -892,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 {
        if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
                return;
-       nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+       nfs_request_add_commit_list(req, cinfo);
 }
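
These write.c hunks show why the dst parameter could be dropped from the unlocked wrapper: the only destination its callers ever passed was &cinfo->mds->list, which the function can now derive from cinfo itself. The _locked variant keeps an explicit dst because pNFS layout drivers point it at per-bucket commit lists; a sketch of such a call site (assumed shape, not verbatim):

    /* a pNFS layout driver steering a request to a data-server commit
     * bucket still names the destination list explicitly */
    spin_lock(cinfo->lock);
    nfs_request_add_commit_list_locked(req, &buckets[ds_commit_idx].written,
    				       cinfo);
    spin_unlock(cinfo->lock);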
 
 static void
index 819ad812c71b903200784f7c65622ff07ce59d72..4cba7865f4966b825e3d47c7d423fdb8ca11a262 100644 (file)
@@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u
        struct inode *inode = d_inode(resfh->fh_dentry);
        int status;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        status = security_inode_setsecctx(resfh->fh_dentry,
                label->data, label->len);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (status)
                /*
index 79f0307a5ec8331426ef331c5c843d59966dd5ff..dc8ebecf561866a2d6fcee729813528852be6313 100644 (file)
@@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 
        dir = nn->rec_file->f_path.dentry;
        /* lock the parent */
-       mutex_lock(&d_inode(dir)->i_mutex);
+       inode_lock(d_inode(dir));
 
        dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
        if (IS_ERR(dentry)) {
@@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 out_put:
        dput(dentry);
 out_unlock:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        if (status == 0) {
                if (nn->in_grace) {
                        crp = nfs4_client_to_reclaim(dname, nn);
@@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
        }
 
        status = iterate_dir(nn->rec_file, &ctx.ctx);
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
        list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
                if (!status) {
@@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
                list_del(&entry->list);
                kfree(entry);
        }
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        nfs4_reset_creds(original_cred);
 
        list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
@@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
        dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
        dir = nn->rec_file->f_path.dentry;
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
        dentry = lookup_one_len(name, dir, namlen);
        if (IS_ERR(dentry)) {
                status = PTR_ERR(dentry);
@@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 out:
        dput(dentry);
 out_unlock:
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
        return status;
 }
 
index 0770bcb543c81d843106825a2c9c0da6be3d15a4..f84fe6bf9aee144cd7227711540f0027e24ef6c2 100644 (file)
@@ -288,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
        }
 
        inode = d_inode(dentry);
-       mutex_lock_nested(&inode->i_mutex, subclass);
+       inode_lock_nested(inode, subclass);
        fill_pre_wcc(fhp);
        fhp->fh_locked = true;
 }
@@ -307,7 +307,7 @@ fh_unlock(struct svc_fh *fhp)
 {
        if (fhp->fh_locked) {
                fill_post_wcc(fhp);
-               mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
+               inode_unlock(d_inode(fhp->fh_dentry));
                fhp->fh_locked = false;
        }
 }
index 6739077f17fe7827beee7b7469b2c06450862362..5d2a57e4c03ad3494e0b08fc3dfbc0e13fb192df 100644 (file)
@@ -493,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
        dentry = fhp->fh_dentry;
 
-       mutex_lock(&d_inode(dentry)->i_mutex);
+       inode_lock(d_inode(dentry));
        host_error = security_inode_setsecctx(dentry, label->data, label->len);
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
        return nfserrno(host_error);
 }
 #else
index 10b22527a617dcdefc36d36ce4fc3d3658e20330..21a1e2e0d92fe22696ebac0bd702238fb95ec02f 100644 (file)
@@ -1003,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (ret)
                return ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        isize = i_size_read(inode);
 
@@ -1113,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (ret == 1)
                ret = 0;
 
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
index aba43811d6ef1221c68b275afda0174f072343bf..e8fe24882b5baa417e3002ada770882f4771534b 100644 (file)
@@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
 
        flags = nilfs_mask_flags(inode->i_mode, flags);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        oldflags = NILFS_I(inode)->i_flags;
 
@@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
        nilfs_mark_inode_dirty(inode);
        ret = nilfs_transaction_commit(inode->i_sb);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        mnt_drop_write_file(filp);
        return ret;
 }
index 9e38dafa3bc78ec71acc7acec733d9cd52c4f40a..b2eff5816adc3e915f03665c4b283f116a7b1eab 100644 (file)
@@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
        err = filemap_write_and_wait_range(vi->i_mapping, start, end);
        if (err)
                return err;
-       mutex_lock(&vi->i_mutex);
+       inode_lock(vi);
 
        BUG_ON(!S_ISDIR(vi->i_mode));
        /* If the bitmap attribute inode is in memory sync it, too. */
@@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
        else
                ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
                                "%u.", datasync ? "data" : "", vi->i_ino, -ret);
-       mutex_unlock(&vi->i_mutex);
+       inode_unlock(vi);
        return ret;
 }
 
index 9d383e5eff0ea5519bffb34528b5aeccbe6f3831..bed4d427dfaee110c7da84c70f32c673fbdc80aa 100644 (file)
@@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        ssize_t written = 0;
        ssize_t err;
 
-       mutex_lock(&vi->i_mutex);
+       inode_lock(vi);
        /* We can write back this queue in page reclaim. */
        current->backing_dev_info = inode_to_bdi(vi);
        err = ntfs_prepare_file_for_write(iocb, from);
        if (iov_iter_count(from) && !err)
                written = ntfs_perform_write(file, from, iocb->ki_pos);
        current->backing_dev_info = NULL;
-       mutex_unlock(&vi->i_mutex);
+       inode_unlock(vi);
        if (likely(written > 0)) {
                err = generic_write_sync(file, iocb->ki_pos, written);
                if (err < 0)
@@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
        err = filemap_write_and_wait_range(vi->i_mapping, start, end);
        if (err)
                return err;
-       mutex_lock(&vi->i_mutex);
+       inode_lock(vi);
 
        BUG_ON(S_ISDIR(vi->i_mode));
        if (!datasync || !NInoNonResident(NTFS_I(vi)))
@@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
        else
                ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
                                "%u.", datasync ? "data" : "", vi->i_ino, -ret);
-       mutex_unlock(&vi->i_mutex);
+       inode_unlock(vi);
        return ret;
 }
 
index d80e3315cab0ec73cfdd9b668594d588f559273e..9793e68ba1ddf47091d0dc80a324dfe27a39dbc1 100644 (file)
@@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
                ntfs_error(vol->sb, "Quota inodes are not open.");
                return false;
        }
-       mutex_lock(&vol->quota_q_ino->i_mutex);
+       inode_lock(vol->quota_q_ino);
        ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
        if (!ictx) {
                ntfs_error(vol->sb, "Failed to get index context.");
@@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
        ntfs_index_entry_mark_dirty(ictx);
 set_done:
        ntfs_index_ctx_put(ictx);
-       mutex_unlock(&vol->quota_q_ino->i_mutex);
+       inode_unlock(vol->quota_q_ino);
        /*
         * We set the flag so we do not try to mark the quotas out of date
         * again on remount.
@@ -110,7 +110,7 @@ done:
 err_out:
        if (ictx)
                ntfs_index_ctx_put(ictx);
-       mutex_unlock(&vol->quota_q_ino->i_mutex);
+       inode_unlock(vol->quota_q_ino);
        return false;
 }
 
index 2f77f8dfb861415c9e5196c129d6d4902e042ce8..1b38abdaa3ed3da7522f461ae49a77ccf5ee5982 100644 (file)
@@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
         * Find the inode number for the hibernation file by looking up the
         * filename hiberfil.sys in the root directory.
         */
-       mutex_lock(&vol->root_ino->i_mutex);
+       inode_lock(vol->root_ino);
        mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
                        &name);
-       mutex_unlock(&vol->root_ino->i_mutex);
+       inode_unlock(vol->root_ino);
        if (IS_ERR_MREF(mref)) {
                ret = MREF_ERR(mref);
                /* If the file does not exist, Windows is not hibernated. */
@@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol)
         * Find the inode number for the quota file by looking up the filename
         * $Quota in the extended system files directory $Extend.
         */
-       mutex_lock(&vol->extend_ino->i_mutex);
+       inode_lock(vol->extend_ino);
        mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
                        &name);
-       mutex_unlock(&vol->extend_ino->i_mutex);
+       inode_unlock(vol->extend_ino);
        if (IS_ERR_MREF(mref)) {
                /*
                 * If the file does not exist, quotas are disabled and have
@@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
         * Find the inode number for the transaction log file by looking up the
         * filename $UsnJrnl in the extended system files directory $Extend.
         */
-       mutex_lock(&vol->extend_ino->i_mutex);
+       inode_lock(vol->extend_ino);
        mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
                        &name);
-       mutex_unlock(&vol->extend_ino->i_mutex);
+       inode_unlock(vol->extend_ino);
        if (IS_ERR_MREF(mref)) {
                /*
                 * If the file does not exist, transaction logging is disabled,
index a3ded88718c93eec4bf9767018826c960285a07d..d002579c6f2b03a62209110a1ce84e7605d2651a 100644 (file)
@@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
                goto bail;
        }
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
 
        if (ocfs2_truncate_log_needs_flush(osb)) {
                ret = __ocfs2_flush_truncate_log(osb);
@@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 out_commit:
        ocfs2_commit_trans(osb, handle);
 out:
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 bail:
        if (meta_ac)
                ocfs2_free_alloc_context(meta_ac);
@@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
        struct ocfs2_dinode *di;
        struct ocfs2_truncate_log *tl;
 
-       BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+       BUG_ON(inode_trylock(tl_inode));
 
        start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
 
@@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
        struct ocfs2_dinode *di;
        struct ocfs2_truncate_log *tl;
 
-       BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+       BUG_ON(inode_trylock(tl_inode));
 
        di = (struct ocfs2_dinode *) tl_bh->b_data;
 
@@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
                goto out;
        }
 
-       mutex_lock(&data_alloc_inode->i_mutex);
+       inode_lock(data_alloc_inode);
 
        status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
        if (status < 0) {
@@ -6035,7 +6035,7 @@ out_unlock:
        ocfs2_inode_unlock(data_alloc_inode, 1);
 
 out_mutex:
-       mutex_unlock(&data_alloc_inode->i_mutex);
+       inode_unlock(data_alloc_inode);
        iput(data_alloc_inode);
 
 out:
@@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
        int status;
        struct inode *tl_inode = osb->osb_tl_inode;
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
        status = __ocfs2_flush_truncate_log(osb);
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 
        return status;
 }
@@ -6208,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
                (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
                num_recs);
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
        for(i = 0; i < num_recs; i++) {
                if (ocfs2_truncate_log_needs_flush(osb)) {
                        status = __ocfs2_flush_truncate_log(osb);
@@ -6239,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
        }
 
 bail_up:
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 
        return status;
 }
@@ -6346,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
                goto out;
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret) {
@@ -6395,7 +6395,7 @@ out_unlock:
        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
 out_mutex:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        iput(inode);
 out:
        while(head) {
@@ -6439,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
        handle_t *handle;
        int ret = 0;
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
 
        while (head) {
                if (ocfs2_truncate_log_needs_flush(osb)) {
@@ -6471,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
                }
        }
 
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 
        while (head) {
                /* Premature exit may have left some dangling items. */
@@ -7355,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
                goto out;
        }
 
-       mutex_lock(&main_bm_inode->i_mutex);
+       inode_lock(main_bm_inode);
 
        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
        if (ret < 0) {
@@ -7422,7 +7422,7 @@ out_unlock:
        ocfs2_inode_unlock(main_bm_inode, 0);
        brelse(main_bm_bh);
 out_mutex:
-       mutex_unlock(&main_bm_inode->i_mutex);
+       inode_unlock(main_bm_inode);
        iput(main_bm_inode);
 out:
        return ret;
index 7f604727f487b3680d7f145161288656dd1aaf80..794fd1587f34a3bc210d60770525a02f3cd59aba 100644 (file)
@@ -2046,9 +2046,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
        int ret = 0;
        unsigned int truncated_clusters;
 
-       mutex_lock(&osb->osb_tl_inode->i_mutex);
+       inode_lock(osb->osb_tl_inode);
        truncated_clusters = osb->truncated_clusters;
-       mutex_unlock(&osb->osb_tl_inode->i_mutex);
+       inode_unlock(osb->osb_tl_inode);
 
        /*
         * Check whether we can succeed in allocating if we free
index ffecf89c8c1cd23d532a9155660a6609e9d38551..e1adf285fc3142ab8c0856ea0a28d766c914ce3f 100644 (file)
@@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
                mlog_errno(ret);
                goto out;
        }
-       mutex_lock(&dx_alloc_inode->i_mutex);
+       inode_lock(dx_alloc_inode);
 
        ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
        if (ret) {
@@ -4410,7 +4410,7 @@ out_unlock:
        ocfs2_inode_unlock(dx_alloc_inode, 1);
 
 out_mutex:
-       mutex_unlock(&dx_alloc_inode->i_mutex);
+       inode_unlock(dx_alloc_inode);
        brelse(dx_alloc_bh);
 out:
        iput(dx_alloc_inode);
index f92612e4b9d61144b13021c83dd8b4b1e9553ac5..474e57f834e6caff12eaca03c8ece4d35303d1e0 100644 (file)
@@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
        unsigned int gen;
        int noqueue_attempted = 0;
        int dlm_locked = 0;
+       int kick_dc = 0;
 
        if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
                mlog_errno(-EINVAL);
@@ -1524,7 +1525,12 @@ update_holders:
 unlock:
        lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 
+       /* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+       kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
        spin_unlock_irqrestore(&lockres->l_lock, flags);
+       if (kick_dc)
+               ocfs2_wake_downconvert_thread(osb);
 out:
        /*
         * This is helping work around a lock inversion between the page lock
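
The kick_dc logic added above closes a race: while OCFS2_LOCK_UPCONVERT_FINISHING is set, the downconvert thread requeues a blocked lock instead of downconverting it, so if nothing wakes the thread after the flag clears, the blocked lock can sit unserviced indefinitely. The companion check lives in ocfs2_unblock_lock(); paraphrased here as a sketch, not quoted verbatim from this tree:

    /* in ocfs2_unblock_lock(): an upconvert still finishing defers the
     * downconvert and requeues the lock resource */
    if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
    	ctl->requeue = 1;
    	spin_unlock_irqrestore(&lockres->l_lock, flags);
    	return 0;
    }

Re-checking OCFS2_LOCK_BLOCKED under l_lock after clearing the flag, and waking the thread if it is still set, guarantees the requeued work gets picked up.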
index d6312793250927b1efacf4f09c18c38f7da75e5d..7cb38fdca229f050f89aed34c93c85db11a99ff8 100644 (file)
@@ -1872,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
                return -EROFS;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * This prevents concurrent writes on other nodes
@@ -1991,7 +1991,7 @@ out_rw_unlock:
        ocfs2_rw_unlock(inode, 1);
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return ret;
 }
 
@@ -2299,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
        appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
        direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
 relock:
        /*
@@ -2435,7 +2435,7 @@ out:
                ocfs2_rw_unlock(inode, rw_level);
 
 out_mutex:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (written)
                ret = written;
@@ -2547,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
        struct inode *inode = file->f_mapping->host;
        int ret = 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        switch (whence) {
        case SEEK_SET:
@@ -2585,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
        offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (ret)
                return ret;
        return offset;
index 97a563bab9a871ee3e371a9796b990c24835f47f..36294446d9607ee9f0d07814552fd6f0bf97e6ab 100644 (file)
@@ -630,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode,
                goto bail;
        }
 
-       mutex_lock(&inode_alloc_inode->i_mutex);
+       inode_lock(inode_alloc_inode);
        status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
        if (status < 0) {
-               mutex_unlock(&inode_alloc_inode->i_mutex);
+               inode_unlock(inode_alloc_inode);
 
                mlog_errno(status);
                goto bail;
@@ -680,7 +680,7 @@ bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
        ocfs2_inode_unlock(inode_alloc_inode, 1);
-       mutex_unlock(&inode_alloc_inode->i_mutex);
+       inode_unlock(inode_alloc_inode);
        brelse(inode_alloc_bh);
 bail:
        iput(inode_alloc_inode);
@@ -751,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
                /* Lock the orphan dir. The lock will be held for the entire
                 * delete_inode operation. We do this now to avoid races with
                 * recovery completion on other nodes. */
-               mutex_lock(&orphan_dir_inode->i_mutex);
+               inode_lock(orphan_dir_inode);
                status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
                if (status < 0) {
-                       mutex_unlock(&orphan_dir_inode->i_mutex);
+                       inode_unlock(orphan_dir_inode);
 
                        mlog_errno(status);
                        goto bail;
@@ -803,7 +803,7 @@ bail_unlock_dir:
                return status;
 
        ocfs2_inode_unlock(orphan_dir_inode, 1);
-       mutex_unlock(&orphan_dir_inode->i_mutex);
+       inode_unlock(orphan_dir_inode);
        brelse(orphan_dir_bh);
 bail:
        iput(orphan_dir_inode);
index 16b0bb482ea7cb68540c8fe7ea133b1bc89185ce..4506ec5ec2ea624235a36c06cb755ef2211e0914 100644 (file)
@@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
        unsigned oldflags;
        int status;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        status = ocfs2_inode_lock(inode, &bh, 1);
        if (status < 0) {
@@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 bail_unlock:
        ocfs2_inode_unlock(inode, 1);
 bail:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        brelse(bh);
 
@@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
        struct ocfs2_dinode *dinode_alloc = NULL;
 
        if (inode_alloc)
-               mutex_lock(&inode_alloc->i_mutex);
+               inode_lock(inode_alloc);
 
        if (o2info_coherent(&fi->ifi_req)) {
                status = ocfs2_inode_lock(inode_alloc, &bh, 0);
@@ -317,7 +317,7 @@ bail:
                ocfs2_inode_unlock(inode_alloc, 0);
 
        if (inode_alloc)
-               mutex_unlock(&inode_alloc->i_mutex);
+               inode_unlock(inode_alloc);
 
        brelse(bh);
 
@@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
        struct ocfs2_dinode *gb_dinode = NULL;
 
        if (gb_inode)
-               mutex_lock(&gb_inode->i_mutex);
+               inode_lock(gb_inode);
 
        if (o2info_coherent(&ffg->iff_req)) {
                status = ocfs2_inode_lock(gb_inode, &bh, 0);
@@ -604,7 +604,7 @@ bail:
                ocfs2_inode_unlock(gb_inode, 0);
 
        if (gb_inode)
-               mutex_unlock(&gb_inode->i_mutex);
+               inode_unlock(gb_inode);
 
        iput(gb_inode);
        brelse(bh);
index 3772a2dbb980c950279349146f14c4ec3887ead2..61b833b721d89e4184453760507996d5f208800a 100644 (file)
@@ -2088,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
                return status;
        }
 
-       mutex_lock(&orphan_dir_inode->i_mutex);
+       inode_lock(orphan_dir_inode);
        status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
        if (status < 0) {
                mlog_errno(status);
@@ -2106,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 out_cluster:
        ocfs2_inode_unlock(orphan_dir_inode, 0);
 out:
-       mutex_unlock(&orphan_dir_inode->i_mutex);
+       inode_unlock(orphan_dir_inode);
        iput(orphan_dir_inode);
        return status;
 }
@@ -2196,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                oi->ip_next_orphan = NULL;
 
                if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
-                       mutex_lock(&inode->i_mutex);
+                       inode_lock(inode);
                        ret = ocfs2_rw_lock(inode, 1);
                        if (ret < 0) {
                                mlog_errno(ret);
@@ -2235,7 +2235,7 @@ unlock_inode:
 unlock_rw:
                        ocfs2_rw_unlock(inode, 1);
 unlock_mutex:
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
 
                        /* clear dio flag in ocfs2_inode_info */
                        oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
index e9c99e35f5ea74e160c158ca05b01371057d24d3..7d62c43a2c3e24934965ba5df66f44e33fc0488a 100644 (file)
@@ -414,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
                goto out;
        }
 
-       mutex_lock(&main_bm_inode->i_mutex);
+       inode_lock(main_bm_inode);
 
        status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (status < 0) {
@@ -468,7 +468,7 @@ out_unlock:
        ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-       mutex_unlock(&main_bm_inode->i_mutex);
+       inode_unlock(main_bm_inode);
        iput(main_bm_inode);
 
 out:
@@ -506,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
                goto bail;
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        status = ocfs2_read_inode_block_full(inode, &alloc_bh,
                                             OCFS2_BH_IGNORE_CACHE);
@@ -539,7 +539,7 @@ bail:
        brelse(alloc_bh);
 
        if (inode) {
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                iput(inode);
        }
 
@@ -571,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
                goto out;
        }
 
-       mutex_lock(&main_bm_inode->i_mutex);
+       inode_lock(main_bm_inode);
 
        status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (status < 0) {
@@ -601,7 +601,7 @@ out_unlock:
        ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-       mutex_unlock(&main_bm_inode->i_mutex);
+       inode_unlock(main_bm_inode);
 
        brelse(main_bm_bh);
 
@@ -643,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
                goto bail;
        }
 
-       mutex_lock(&local_alloc_inode->i_mutex);
+       inode_lock(local_alloc_inode);
 
        /*
         * We must double check state and allocator bits because
@@ -709,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
        status = 0;
 bail:
        if (status < 0 && local_alloc_inode) {
-               mutex_unlock(&local_alloc_inode->i_mutex);
+               inode_unlock(local_alloc_inode);
                iput(local_alloc_inode);
        }
 
index 124471d26a73f4fe79738e2b89662d82a7ad0561..e3d05d9901a3da26b123865545446f5ef96b08a5 100644 (file)
@@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
         *      context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
         */
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
 
        if (ocfs2_truncate_log_needs_flush(osb)) {
                ret = __ocfs2_flush_truncate_log(osb);
@@ -338,7 +338,7 @@ out_commit:
        ocfs2_commit_trans(osb, handle);
 
 out_unlock_mutex:
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 
        if (context->data_ac) {
                ocfs2_free_alloc_context(context->data_ac);
@@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
                goto out;
        }
 
-       mutex_lock(&gb_inode->i_mutex);
+       inode_lock(gb_inode);
 
        ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
        if (ret) {
@@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
                goto out_unlock_gb_mutex;
        }
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
 
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
@@ -708,11 +708,11 @@ out_commit:
        brelse(gd_bh);
 
 out_unlock_tl_inode:
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 
        ocfs2_inode_unlock(gb_inode, 1);
 out_unlock_gb_mutex:
-       mutex_unlock(&gb_inode->i_mutex);
+       inode_unlock(gb_inode);
        brelse(gb_bh);
        iput(gb_inode);
 
@@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
                return -EROFS;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /*
         * This prevents concurrent writes from other nodes
@@ -969,7 +969,7 @@ out_inode_unlock:
 out_rw_unlock:
        ocfs2_rw_unlock(inode, 1);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        return status;
 }
index ab42c38031b17dccb5a872781cec7784ca2d4cfa..6b3e87189a6467fd3c72533db5c52d149ba2064d 100644 (file)
@@ -1045,7 +1045,7 @@ leave:
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
                ocfs2_inode_unlock(orphan_dir, 1);
-               mutex_unlock(&orphan_dir->i_mutex);
+               inode_unlock(orphan_dir);
                iput(orphan_dir);
        }
 
@@ -1664,7 +1664,7 @@ bail:
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
                ocfs2_inode_unlock(orphan_dir, 1);
-               mutex_unlock(&orphan_dir->i_mutex);
+               inode_unlock(orphan_dir);
                iput(orphan_dir);
        }
 
@@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
                return ret;
        }
 
-       mutex_lock(&orphan_dir_inode->i_mutex);
+       inode_lock(orphan_dir_inode);
 
        ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
        if (ret < 0) {
-               mutex_unlock(&orphan_dir_inode->i_mutex);
+               inode_unlock(orphan_dir_inode);
                iput(orphan_dir_inode);
 
                mlog_errno(ret);
@@ -2226,7 +2226,7 @@ out:
 
        if (ret) {
                ocfs2_inode_unlock(orphan_dir_inode, 1);
-               mutex_unlock(&orphan_dir_inode->i_mutex);
+               inode_unlock(orphan_dir_inode);
                iput(orphan_dir_inode);
        }
 
@@ -2495,7 +2495,7 @@ out:
                        ocfs2_free_alloc_context(inode_ac);
 
                /* Unroll orphan dir locking */
-               mutex_unlock(&orphan_dir->i_mutex);
+               inode_unlock(orphan_dir);
                ocfs2_inode_unlock(orphan_dir, 1);
                iput(orphan_dir);
        }
@@ -2602,7 +2602,7 @@ leave:
        if (orphan_dir) {
                /* This was locked for us in ocfs2_prepare_orphan_dir() */
                ocfs2_inode_unlock(orphan_dir, 1);
-               mutex_unlock(&orphan_dir->i_mutex);
+               inode_unlock(orphan_dir);
                iput(orphan_dir);
        }
 
@@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
 
 bail_unlock_orphan:
        ocfs2_inode_unlock(orphan_dir_inode, 1);
-       mutex_unlock(&orphan_dir_inode->i_mutex);
+       inode_unlock(orphan_dir_inode);
        iput(orphan_dir_inode);
 
        ocfs2_free_dir_lookup_result(&orphan_insert);
@@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
                goto bail;
        }
 
-       mutex_lock(&orphan_dir_inode->i_mutex);
+       inode_lock(orphan_dir_inode);
        status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
        if (status < 0) {
-               mutex_unlock(&orphan_dir_inode->i_mutex);
+               inode_unlock(orphan_dir_inode);
                iput(orphan_dir_inode);
                mlog_errno(status);
                goto bail;
@@ -2770,7 +2770,7 @@ bail_commit:
 
 bail_unlock_orphan:
        ocfs2_inode_unlock(orphan_dir_inode, 1);
-       mutex_unlock(&orphan_dir_inode->i_mutex);
+       inode_unlock(orphan_dir_inode);
        brelse(orphan_dir_bh);
        iput(orphan_dir_inode);
 
@@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
                goto leave;
        }
 
-       mutex_lock(&orphan_dir_inode->i_mutex);
+       inode_lock(orphan_dir_inode);
 
        status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
        if (status < 0) {
                mlog_errno(status);
-               mutex_unlock(&orphan_dir_inode->i_mutex);
+               inode_unlock(orphan_dir_inode);
                iput(orphan_dir_inode);
                goto leave;
        }
@@ -2901,7 +2901,7 @@ out_commit:
        ocfs2_commit_trans(osb, handle);
 orphan_unlock:
        ocfs2_inode_unlock(orphan_dir_inode, 1);
-       mutex_unlock(&orphan_dir_inode->i_mutex);
+       inode_unlock(orphan_dir_inode);
        iput(orphan_dir_inode);
 leave:
 
index fde9ef18cff3c4ba59d7891ab395bc739fdbf81e..9c9dd30bc94541fa89baed2b0ac8c2a3c8a72e31 100644 (file)
@@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
                WARN_ON(bh != oinfo->dqi_gqi_bh);
        spin_unlock(&dq_data_lock);
        if (ex) {
-               mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+               inode_lock(oinfo->dqi_gqinode);
                down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
        } else {
                down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
@@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
 {
        if (ex) {
                up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
-               mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+               inode_unlock(oinfo->dqi_gqinode);
        } else {
                up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
        }
index 252119860e6c1ae36fbfe7d5b5b84010341f8e5b..3eff031aaf264df58b74c4165749b0c2385c6312 100644 (file)
@@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
                        mlog_errno(ret);
                        goto out;
                }
-               mutex_lock(&alloc_inode->i_mutex);
+               inode_lock(alloc_inode);
 
                ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
                if (ret) {
@@ -867,7 +867,7 @@ out_unlock:
        }
 out_mutex:
        if (alloc_inode) {
-               mutex_unlock(&alloc_inode->i_mutex);
+               inode_unlock(alloc_inode);
                iput(alloc_inode);
        }
 out:
@@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
                goto out;
        }
 
-       mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+       inode_lock_nested(new_inode, I_MUTEX_CHILD);
        ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
                                      OI_LS_REFLINK_TARGET);
        if (ret) {
@@ -4231,7 +4231,7 @@ inode_unlock:
        ocfs2_inode_unlock(new_inode, 1);
        brelse(new_bh);
 out_unlock:
-       mutex_unlock(&new_inode->i_mutex);
+       inode_unlock(new_inode);
 out:
        if (!ret) {
                ret = filemap_fdatawait(inode->i_mapping);
@@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
                        return error;
        }
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        error = dquot_initialize(dir);
        if (!error)
                error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (!error)
                fsnotify_create(dir, new_dentry);
        return error;
index 79b8021302b3eed84b9c59637cddd0e4844eb78b..576b9a04873f2f064fe3f66842713768b8f03f83 100644 (file)
@@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
                goto out;
        }
 
-       mutex_lock(&main_bm_inode->i_mutex);
+       inode_lock(main_bm_inode);
 
        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (ret < 0) {
@@ -375,7 +375,7 @@ out_unlock:
        ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-       mutex_unlock(&main_bm_inode->i_mutex);
+       inode_unlock(main_bm_inode);
        iput(main_bm_inode);
 
 out:
@@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
                goto out;
        }
 
-       mutex_lock(&main_bm_inode->i_mutex);
+       inode_lock(main_bm_inode);
 
        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
        if (ret < 0) {
@@ -590,7 +590,7 @@ out_unlock:
        ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-       mutex_unlock(&main_bm_inode->i_mutex);
+       inode_unlock(main_bm_inode);
        iput(main_bm_inode);
 
 out:
index fc6d25f6d4442c79dd376da24f18c3e3355232ed..2f19aeec5482106de43b65bc6598c42fb4ed2e63 100644 (file)
@@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
                if (ac->ac_which != OCFS2_AC_USE_LOCAL)
                        ocfs2_inode_unlock(inode, 1);
 
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
 
                iput(inode);
                ac->ac_inode = NULL;
@@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                return -EINVAL;
        }
 
-       mutex_lock(&alloc_inode->i_mutex);
+       inode_lock(alloc_inode);
 
        status = ocfs2_inode_lock(alloc_inode, &bh, 1);
        if (status < 0) {
-               mutex_unlock(&alloc_inode->i_mutex);
+               inode_unlock(alloc_inode);
                iput(alloc_inode);
 
                mlog_errno(status);
@@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
                goto bail;
        }
 
-       mutex_lock(&inode_alloc_inode->i_mutex);
+       inode_lock(inode_alloc_inode);
        status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
        if (status < 0) {
-               mutex_unlock(&inode_alloc_inode->i_mutex);
+               inode_unlock(inode_alloc_inode);
                iput(inode_alloc_inode);
                mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
                     (u32)suballoc_slot, status);
@@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
                mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
 
        ocfs2_inode_unlock(inode_alloc_inode, 0);
-       mutex_unlock(&inode_alloc_inode->i_mutex);
+       inode_unlock(inode_alloc_inode);
 
        iput(inode_alloc_inode);
        brelse(alloc_bh);
index f0e241ffd94fc9d9e4ffb3d9afbda6e6eb932c3b..7d3d979f57d9142169f93f2c88b27a612dc06dc4 100644 (file)
@@ -2524,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
                mlog_errno(ret);
                goto out;
        }
-       mutex_lock(&xb_alloc_inode->i_mutex);
+       inode_lock(xb_alloc_inode);
 
        ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
        if (ret < 0) {
@@ -2549,7 +2549,7 @@ out_unlock:
        ocfs2_inode_unlock(xb_alloc_inode, 1);
        brelse(xb_alloc_bh);
 out_mutex:
-       mutex_unlock(&xb_alloc_inode->i_mutex);
+       inode_unlock(xb_alloc_inode);
        iput(xb_alloc_inode);
 out:
        brelse(blk_bh);
@@ -3619,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode,
                }
        }
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
 
        if (ocfs2_truncate_log_needs_flush(osb)) {
                ret = __ocfs2_flush_truncate_log(osb);
                if (ret < 0) {
-                       mutex_unlock(&tl_inode->i_mutex);
+                       inode_unlock(tl_inode);
                        mlog_errno(ret);
                        goto cleanup;
                }
        }
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 
        ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
                                        &xbs, &ctxt, ref_meta, &credits);
@@ -5460,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
                return ret;
        }
 
-       mutex_lock(&tl_inode->i_mutex);
+       inode_lock(tl_inode);
 
        if (ocfs2_truncate_log_needs_flush(osb)) {
                ret = __ocfs2_flush_truncate_log(osb);
@@ -5504,7 +5504,7 @@ out_commit:
 out:
        ocfs2_schedule_truncate_log_flush(osb, 1);
 
-       mutex_unlock(&tl_inode->i_mutex);
+       inode_unlock(tl_inode);
 
        if (meta_ac)
                ocfs2_free_alloc_context(meta_ac);
index b25b1542c5304a74999db5ad52af57e89133a68e..55bdc75e2172a38487ea2942176c127fdad0eb82 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
        if (ret)
                newattrs.ia_valid |= ret | ATTR_FORCE;
 
-       mutex_lock(&dentry->d_inode->i_mutex);
+       inode_lock(dentry->d_inode);
        /* Note any delegations or leases have already been broken: */
        ret = notify_change(dentry, &newattrs, NULL);
-       mutex_unlock(&dentry->d_inode->i_mutex);
+       inode_unlock(dentry->d_inode);
        return ret;
 }
 
@@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode)
        if (error)
                return error;
 retry_deleg:
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        error = security_path_chmod(path, mode);
        if (error)
                goto out_unlock;
@@ -518,7 +518,7 @@ retry_deleg:
        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
        error = notify_change(path->dentry, &newattrs, &delegated_inode);
 out_unlock:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
@@ -593,11 +593,11 @@ retry_deleg:
        if (!S_ISDIR(inode->i_mode))
                newattrs.ia_valid |=
                        ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        error = security_path_chown(path, uid, gid);
        if (!error)
                error = notify_change(path->dentry, &newattrs, &delegated_inode);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
index eff6319d50373c05d4d829b0fd83fa784d5f187b..d894e7cd9a864239ea19dc86d405239ee2405607 100644 (file)
@@ -248,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
        if (err)
                goto out_cleanup;
 
-       mutex_lock(&newdentry->d_inode->i_mutex);
+       inode_lock(newdentry->d_inode);
        err = ovl_set_attr(newdentry, stat);
-       mutex_unlock(&newdentry->d_inode->i_mutex);
+       inode_unlock(newdentry->d_inode);
        if (err)
                goto out_cleanup;
 
index 692ceda3bc21f6976b65f3e2d5aa4b7ef2e9c5e8..ed95272d57a61af3f26a58c71cf789d3c94e28ce 100644 (file)
@@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
        struct dentry *newdentry;
        int err;
 
-       mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(udir, I_MUTEX_PARENT);
        newdentry = lookup_one_len(dentry->d_name.name, upperdir,
                                   dentry->d_name.len);
        err = PTR_ERR(newdentry);
@@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 out_dput:
        dput(newdentry);
 out_unlock:
-       mutex_unlock(&udir->i_mutex);
+       inode_unlock(udir);
        return err;
 }
 
@@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
        if (err)
                goto out_cleanup;
 
-       mutex_lock(&opaquedir->d_inode->i_mutex);
+       inode_lock(opaquedir->d_inode);
        err = ovl_set_attr(opaquedir, &stat);
-       mutex_unlock(&opaquedir->d_inode->i_mutex);
+       inode_unlock(opaquedir->d_inode);
        if (err)
                goto out_cleanup;
 
@@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
        struct dentry *upper = ovl_dentry_upper(dentry);
        int err;
 
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(dir, I_MUTEX_PARENT);
        err = -ESTALE;
        if (upper->d_parent == upperdir) {
                /* Don't let d_delete() think it can reset d_inode */
@@ -619,7 +619,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
         * now.
         */
        d_drop(dentry);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
 
        return err;
 }
index bf996e574f3d842995847f72b8ee7bb7466f2539..49e204560655a8f1737a2a03a38b7a88604a5a4c 100644 (file)
@@ -63,9 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
        if (!err) {
                upperdentry = ovl_dentry_upper(dentry);
 
-               mutex_lock(&upperdentry->d_inode->i_mutex);
+               inode_lock(upperdentry->d_inode);
                err = notify_change(upperdentry, attr, NULL);
-               mutex_unlock(&upperdentry->d_inode->i_mutex);
+               inode_unlock(upperdentry->d_inode);
        }
        ovl_drop_write(dentry);
 out:
index adcb1398c48128682875b2ff4a6b42d95c1e116e..fdaf28f75e12acf1f4dba023c31017c81747744b 100644 (file)
@@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
                                dput(dentry);
                        }
                }
-               mutex_unlock(&dir->d_inode->i_mutex);
+               inode_unlock(dir->d_inode);
        }
        revert_creds(old_cred);
        put_cred(override_cred);
@@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
        loff_t res;
        struct ovl_dir_file *od = file->private_data;
 
-       mutex_lock(&file_inode(file)->i_mutex);
+       inode_lock(file_inode(file));
        if (!file->f_pos)
                ovl_dir_reset(file);
 
@@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
                res = offset;
        }
 out_unlock:
-       mutex_unlock(&file_inode(file)->i_mutex);
+       inode_unlock(file_inode(file));
 
        return res;
 }
@@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
                        ovl_path_upper(dentry, &upperpath);
                        realfile = ovl_path_open(&upperpath, O_RDONLY);
                        smp_mb__before_spinlock();
-                       mutex_lock(&inode->i_mutex);
+                       inode_lock(inode);
                        if (!od->upperfile) {
                                if (IS_ERR(realfile)) {
-                                       mutex_unlock(&inode->i_mutex);
+                                       inode_unlock(inode);
                                        return PTR_ERR(realfile);
                                }
                                od->upperfile = realfile;
@@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
                                        fput(realfile);
                                realfile = od->upperfile;
                        }
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                }
        }
 
@@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file)
        struct ovl_dir_file *od = file->private_data;
 
        if (od->cache) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                ovl_cache_put(od, file->f_path.dentry);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        fput(od->realfile);
        if (od->upperfile)
@@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
 {
        struct ovl_cache_entry *p;
 
-       mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+       inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
        list_for_each_entry(p, list, l_node) {
                struct dentry *dentry;
 
@@ -575,5 +575,5 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
                        ovl_cleanup(upper->d_inode, dentry);
                dput(dentry);
        }
-       mutex_unlock(&upper->d_inode->i_mutex);
+       inode_unlock(upper->d_inode);
 }
index d250604f985ab91abaa60e70a6a4025049016916..8d826bd56b26b10d641d7ce1f7a47b747851fac9 100644 (file)
@@ -229,7 +229,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
 {
        struct ovl_entry *oe = dentry->d_fsdata;
 
-       WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+       WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
        WARN_ON(oe->__upperdentry);
        BUG_ON(!upperdentry->d_inode);
        /*
@@ -244,7 +244,7 @@ void ovl_dentry_version_inc(struct dentry *dentry)
 {
        struct ovl_entry *oe = dentry->d_fsdata;
 
-       WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+       WARN_ON(!inode_is_locked(dentry->d_inode));
        oe->version++;
 }
 
@@ -252,7 +252,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry)
 {
        struct ovl_entry *oe = dentry->d_fsdata;
 
-       WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+       WARN_ON(!inode_is_locked(dentry->d_inode));
        return oe->version;
 }
 
@@ -375,9 +375,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
 {
        struct dentry *dentry;
 
-       mutex_lock(&dir->d_inode->i_mutex);
+       inode_lock(dir->d_inode);
        dentry = lookup_one_len(name->name, dir, name->len);
-       mutex_unlock(&dir->d_inode->i_mutex);
+       inode_unlock(dir->d_inode);
 
        if (IS_ERR(dentry)) {
                if (PTR_ERR(dentry) == -ENOENT)
@@ -744,7 +744,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
        if (err)
                return ERR_PTR(err);
 
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(dir, I_MUTEX_PARENT);
 retry:
        work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
                              strlen(OVL_WORKDIR_NAME));
@@ -770,7 +770,7 @@ retry:
                        goto out_dput;
        }
 out_unlock:
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        mnt_drop_write(mnt);
 
        return work;
index 42cf8ddf0e5599da6621fcdf741d6a61fd2f2d7a..ab8dad3ccb6a8bac13a2eab25eeb2b68791d60f5 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576;
  */
 unsigned int pipe_min_size = PAGE_SIZE;
 
+/* Maximum allocatable pages per user. The hard limit is unset by default;
+ * the soft limit defaults to what a user could allocate anyway with
+ * default-sized pipes on the default open-file limit (PIPE_DEF_BUFFERS *
+ * INR_OPEN_CUR pages).
+ */
+unsigned long pipe_user_pages_hard;
+unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
+
 /*
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
@@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on)
        return retval;
 }
 
+static void account_pipe_buffers(struct pipe_inode_info *pipe,
+                                 unsigned long old, unsigned long new)
+{
+       atomic_long_add(new - old, &pipe->user->pipe_bufs);
+}
+
+static bool too_many_pipe_buffers_soft(struct user_struct *user)
+{
+       return pipe_user_pages_soft &&
+              atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
+}
+
+static bool too_many_pipe_buffers_hard(struct user_struct *user)
+{
+       return pipe_user_pages_hard &&
+              atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
+}
+
 struct pipe_inode_info *alloc_pipe_info(void)
 {
        struct pipe_inode_info *pipe;
 
        pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
        if (pipe) {
-               pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+               unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
+               struct user_struct *user = get_current_user();
+
+               if (!too_many_pipe_buffers_hard(user)) {
+                       if (too_many_pipe_buffers_soft(user))
+                               pipe_bufs = 1;
+                       pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
+               }
+
                if (pipe->bufs) {
                        init_waitqueue_head(&pipe->wait);
                        pipe->r_counter = pipe->w_counter = 1;
-                       pipe->buffers = PIPE_DEF_BUFFERS;
+                       pipe->buffers = pipe_bufs;
+                       pipe->user = user;
+                       account_pipe_buffers(pipe, 0, pipe_bufs);
                        mutex_init(&pipe->mutex);
                        return pipe;
                }
+               free_uid(user);
                kfree(pipe);
        }
 
@@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)
 {
        int i;
 
+       account_pipe_buffers(pipe, pipe->buffers, 0);
+       free_uid(pipe->user);
        for (i = 0; i < pipe->buffers; i++) {
                struct pipe_buffer *buf = pipe->bufs + i;
                if (buf->ops)
@@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
                        memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
        }
 
+       account_pipe_buffers(pipe, pipe->buffers, nr_pages);
        pipe->curbuf = 0;
        kfree(pipe->bufs);
        pipe->bufs = bufs;
@@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
                if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
                        ret = -EPERM;
                        goto out;
+               } else if ((too_many_pipe_buffers_hard(pipe->user) ||
+                           too_many_pipe_buffers_soft(pipe->user)) &&
+                          !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+                       ret = -EPERM;
+                       goto out;
                }
                ret = pipe_set_size(pipe, nr_pages);
                break;
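The observable effect of the new per-user accounting: past the soft limit new pipes fall back to a single buffer, and an unprivileged F_SETPIPE_SZ over either limit now fails. A hypothetical userspace probe, assuming the limits are exported as /proc/sys/fs/pipe-user-pages-{soft,hard} sysctls as the variable names suggest (the 1 MiB request size is arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fds[2];

        if (pipe(fds))
                return 1;

        /* With the accounting above, a task whose user is over the
         * soft/hard limits and lacks CAP_SYS_RESOURCE and CAP_SYS_ADMIN
         * should see this fail with EPERM. */
        if (fcntl(fds[0], F_SETPIPE_SZ, 1 << 20) < 0)
                perror("F_SETPIPE_SZ");

        close(fds[0]);
        close(fds[1]);
        return 0;
}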
index 92e6726f6e3732573bd9a64f3b313cc3508ce519..a939f5ed7f89ccb39673fef11beb542e5bef8159 100644 (file)
@@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
        if (kcore_need_update)
                kcore_update_ram();
        if (i_size_read(inode) != proc_root_kcore->size) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                i_size_write(inode, proc_root_kcore->size);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        return 0;
 }
index 67e8db442cf03802c7758a76dd8adf79b46a3a50..b6a8d3529fea9714fb25d063e9505b198201e7ab 100644 (file)
@@ -50,7 +50,7 @@ int proc_setup_self(struct super_block *s)
        struct pid_namespace *ns = s->s_fs_info;
        struct dentry *self;
        
-       mutex_lock(&root_inode->i_mutex);
+       inode_lock(root_inode);
        self = d_alloc_name(s->s_root, "self");
        if (self) {
                struct inode *inode = new_inode_pseudo(s);
@@ -69,7 +69,7 @@ int proc_setup_self(struct super_block *s)
        } else {
                self = ERR_PTR(-ENOMEM);
        }
-       mutex_unlock(&root_inode->i_mutex);
+       inode_unlock(root_inode);
        if (IS_ERR(self)) {
                pr_err("proc_fill_super: can't allocate /proc/self\n");
                return PTR_ERR(self);
index 71ffc91060f6d6ad32081bfa53d2a3c7b3706e53..85d16c67c33eaa8b7a5ac65e38ec465814916619 100644 (file)
@@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                smaps_pmd_entry(pmd, addr, walk);
                spin_unlock(ptl);
                return 0;
@@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
        spinlock_t *ptl;
        struct page *page;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
                        clear_soft_dirty_pmd(vma, addr, pmd);
                        goto out;
@@ -1187,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
        int err = 0;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
+       ptl = pmd_trans_huge_lock(pmdp, vma);
+       if (ptl) {
                u64 flags = 0, frame = 0;
                pmd_t pmd = *pmdp;
 
@@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
        pte_t *orig_pte;
        pte_t *pte;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                pte_t huge_pte = *(pte_t *)pmd;
                struct page *page;
 
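These hunks track an interface change in which pmd_trans_huge_lock() returns the page-table lock (or NULL) instead of filling a spinlock_t ** out-parameter. A sketch of the new convention, modelled on the include/linux/huge_mm.h wrapper of this era:

static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                                              struct vm_area_struct *vma)
{
        /* Take and return the ptl when this is a huge pmd; return NULL
         * so the caller can fall back to the pte-level walk otherwise. */
        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
                return __pmd_trans_huge_lock(pmd, vma);
        return NULL;
}

Testing the returned ptl replaces the old boolean-plus-out-parameter pattern on the removed lines.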
index 9eacd59e0360f1367a084a1e7bfa5dca0e3fe170..e58a31e8fb2a186aaa2d3dee62811f090e2ade57 100644 (file)
@@ -52,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s)
        struct pid_namespace *ns = s->s_fs_info;
        struct dentry *thread_self;
 
-       mutex_lock(&root_inode->i_mutex);
+       inode_lock(root_inode);
        thread_self = d_alloc_name(s->s_root, "thread-self");
        if (thread_self) {
                struct inode *inode = new_inode_pseudo(s);
@@ -71,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s)
        } else {
                thread_self = ERR_PTR(-ENOMEM);
        }
-       mutex_unlock(&root_inode->i_mutex);
+       inode_unlock(root_inode);
        if (IS_ERR(thread_self)) {
                pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
                return PTR_ERR(thread_self);
index d8c439d813ce1bd0a415fd09a8407c6c6192d49b..dc645b66cd79aea96ea723f8b127a16f7d6f70cd 100644 (file)
@@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
                break;
        }
 
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
 
        dentry = d_alloc_name(root, name);
        if (!dentry)
@@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
        list_add(&private->list, &allpstore);
        spin_unlock_irqrestore(&allpstore_lock, flags);
 
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
 
        return 0;
 
 fail_lockedalloc:
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
        kfree(private);
 fail_alloc:
        iput(inode);
index fbd70af98820752ca7126363c4597de3d781b14d..3c3b81bb6dfebbd3d6c3f1f8f1252e46308bf9c9 100644 (file)
@@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type)
                        continue;
                if (!sb_has_quota_active(sb, cnt))
                        continue;
-               mutex_lock(&dqopt->files[cnt]->i_mutex);
+               inode_lock(dqopt->files[cnt]);
                truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
-               mutex_unlock(&dqopt->files[cnt]->i_mutex);
+               inode_unlock(dqopt->files[cnt]);
        }
        mutex_unlock(&dqopt->dqonoff_mutex);
 
@@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
                        /* If quota was reenabled in the meantime, we have
                         * nothing to do */
                        if (!sb_has_quota_loaded(sb, cnt)) {
-                               mutex_lock(&toputinode[cnt]->i_mutex);
+                               inode_lock(toputinode[cnt]);
                                toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
                                  S_NOATIME | S_NOQUOTA);
                                truncate_inode_pages(&toputinode[cnt]->i_data,
                                                     0);
-                               mutex_unlock(&toputinode[cnt]->i_mutex);
+                               inode_unlock(toputinode[cnt]);
                                mark_inode_dirty_sync(toputinode[cnt]);
                        }
                        mutex_unlock(&dqopt->dqonoff_mutex);
@@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
                /* We don't want quota and atime on quota files (deadlocks
                 * are possible). Also nobody should write to the file - we
                 * use special IO operations which ignore the immutable bit. */
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
                                             S_NOQUOTA);
                inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                /*
                 * When S_NOQUOTA is set, remove dquot references as no more
                 * references can be added
@@ -2305,12 +2305,12 @@ out_file_init:
        iput(inode);
 out_lock:
        if (oldflags != -1) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                /* Set the flags back (in the case of accidental quotaon()
                 * on a wrong file we don't want to mess up the flags) */
                inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
                inode->i_flags |= oldflags;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        mutex_unlock(&dqopt->dqonoff_mutex);
 out_fmt:
@@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
        struct dentry *dentry;
        int error;
 
-       mutex_lock(&d_inode(sb->s_root)->i_mutex);
+       inode_lock(d_inode(sb->s_root));
        dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
-       mutex_unlock(&d_inode(sb->s_root)->i_mutex);
+       inode_unlock(d_inode(sb->s_root));
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
 
index 06b07d5a08fec4be6d83e9c92192963da6c8bc5c..324ec271cc4e64868c34e3ff2f28ac2c0542475e 100644 (file)
@@ -238,7 +238,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
        struct inode *inode = file_inode(file);
        loff_t retval;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        switch (whence) {
                case SEEK_END:
                        offset += i_size_read(inode);
@@ -283,7 +283,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
                retval = offset;
        }
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return retval;
 }
 EXPORT_SYMBOL(default_llseek);
@@ -1656,6 +1656,9 @@ next_file:
                mnt_drop_write_file(dst_file);
 next_loop:
                fdput(dst_fd);
+
+               if (fatal_signal_pending(current))
+                       goto out;
        }
 
 out:
index ced679179cac0686407c3743cff177289bfc3959..e69ef3b79787b9fb4f02ab65b35c28571c9df1a0 100644 (file)
@@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
                fsnotify_access(file);
                file_accessed(file);
        }
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 out:
        return res;
 }
index 4a024e2ceb9f76fbf9156d0dc84c3e09a4a1ff15..3abd4004184ba0b49ddc91dac59795230125e133 100644 (file)
@@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
        if (err)
                return err;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        reiserfs_write_lock(inode->i_sb);
        err = reiserfs_commit_for_inode(inode);
        reiserfs_write_unlock(inode->i_sb);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (err < 0)
                return err;
        return 0;
index 96a1bcf33db4435098baa153f807a71274543aaf..9424a4ba93a9504b12c75c08d0f4a4cd8f49fe20 100644 (file)
@@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
        if (err)
                return err;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        BUG_ON(!S_ISREG(inode->i_mode));
        err = sync_mapping_buffers(inode->i_mapping);
        reiserfs_write_lock(inode->i_sb);
@@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
        reiserfs_write_unlock(inode->i_sb);
        if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (barrier_done < 0)
                return barrier_done;
        return (err < 0) ? -EIO : 0;
index 6ec8a30a0911b953e0c79ddcbd1458c8fc13142a..036a1fc0a8c35655a2bdb6cfb5b79dfccca04d54 100644 (file)
@@ -224,7 +224,7 @@ out_unlock:
        page_cache_release(page);
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        reiserfs_write_unlock(inode->i_sb);
        return retval;
 }
index 05db7473bcb5c64078dc572d2ef8e455a770dbba..c0306ec8ed7b8f5d4eef97fd1cf4c7653c2b1f20 100644 (file)
@@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s)
                pathrelse(&path);
 
                inode = reiserfs_iget(s, &obj_key);
-               if (!inode) {
+               if (IS_ERR_OR_NULL(inode)) {
                        /*
                         * the unlink almost completed, it just did not
                         * manage to remove "save" link and release objectid
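The check works because reiserfs_iget() can report failure either as an ERR_PTR or as NULL; IS_ERR_OR_NULL() catches both where the old NULL test missed the former. For reference, a sketch matching the include/linux/err.h helper:

static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        return unlikely(!ptr || IS_ERR_VALUE((unsigned long)ptr));
}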
index e5ddb4e5ea9497956cfa7b04dca84eba790829d1..57e0b23105327b298d17db42d44c2ebf8bd31715 100644 (file)
 #ifdef CONFIG_REISERFS_FS_XATTR
 static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
-       BUG_ON(!mutex_is_locked(&dir->i_mutex));
+       BUG_ON(!inode_is_locked(dir));
        return dir->i_op->create(dir, dentry, mode, true);
 }
 #endif
 
 static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-       BUG_ON(!mutex_is_locked(&dir->i_mutex));
+       BUG_ON(!inode_is_locked(dir));
        return dir->i_op->mkdir(dir, dentry, mode);
 }
 
@@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
 {
        int error;
 
-       BUG_ON(!mutex_is_locked(&dir->i_mutex));
+       BUG_ON(!inode_is_locked(dir));
 
-       mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+       inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
        error = dir->i_op->unlink(dir, dentry);
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
 
        if (!error)
                d_delete(dentry);
@@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 {
        int error;
 
-       BUG_ON(!mutex_is_locked(&dir->i_mutex));
+       BUG_ON(!inode_is_locked(dir));
 
-       mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+       inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
        error = dir->i_op->rmdir(dir, dentry);
        if (!error)
                d_inode(dentry)->i_flags |= S_DEAD;
-       mutex_unlock(&d_inode(dentry)->i_mutex);
+       inode_unlock(d_inode(dentry));
        if (!error)
                d_delete(dentry);
 
@@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
        if (d_really_is_negative(privroot))
                return ERR_PTR(-ENODATA);
 
-       mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR);
+       inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
 
        xaroot = dget(REISERFS_SB(sb)->xattr_root);
        if (!xaroot)
@@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
                }
        }
 
-       mutex_unlock(&d_inode(privroot)->i_mutex);
+       inode_unlock(d_inode(privroot));
        return xaroot;
 }
 
@@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
                 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
                 inode->i_generation);
 
-       mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR);
+       inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
 
        xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
        if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
@@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
                }
        }
 
-       mutex_unlock(&d_inode(xaroot)->i_mutex);
+       inode_unlock(d_inode(xaroot));
        dput(xaroot);
        return xadir;
 }
@@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
                container_of(ctx, struct reiserfs_dentry_buf, ctx);
        struct dentry *dentry;
 
-       WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex));
+       WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
 
        if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
                return -ENOSPC;
@@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
                goto out_dir;
        }
 
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+       inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
 
        buf.xadir = dir;
        while (1) {
@@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
                        break;
                buf.count = 0;
        }
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 
        cleanup_dentry_buf(&buf);
 
@@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
                if (!err) {
                        int jerror;
 
-                       mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex,
+                       inode_lock_nested(d_inode(dir->d_parent),
                                          I_MUTEX_XATTR);
                        err = action(dir, data);
                        reiserfs_write_lock(inode->i_sb);
                        jerror = journal_end(&th);
                        reiserfs_write_unlock(inode->i_sb);
-                       mutex_unlock(&d_inode(dir->d_parent)->i_mutex);
+                       inode_unlock(d_inode(dir->d_parent));
                        err = jerror ?: err;
                }
        }
@@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
        if (IS_ERR(xadir))
                return ERR_CAST(xadir);
 
-       mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+       inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
        xafile = lookup_one_len(name, xadir, strlen(name));
        if (IS_ERR(xafile)) {
                err = PTR_ERR(xafile);
@@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
        if (err)
                dput(xafile);
 out:
-       mutex_unlock(&d_inode(xadir)->i_mutex);
+       inode_unlock(d_inode(xadir));
        dput(xadir);
        if (err)
                return ERR_PTR(err);
@@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
        if (IS_ERR(xadir))
                return PTR_ERR(xadir);
 
-       mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+       inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
        dentry = lookup_one_len(name, xadir, strlen(name));
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
@@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
 
        dput(dentry);
 out_dput:
-       mutex_unlock(&d_inode(xadir)->i_mutex);
+       inode_unlock(d_inode(xadir));
        dput(xadir);
        return err;
 }
@@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
                        .ia_valid = ATTR_SIZE | ATTR_CTIME,
                };
 
-               mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR);
+               inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
                inode_dio_wait(d_inode(dentry));
 
                err = reiserfs_setattr(dentry, &newattrs);
-               mutex_unlock(&d_inode(dentry)->i_mutex);
+               inode_unlock(d_inode(dentry));
        } else
                update_ctime(inode);
 out_unlock:
@@ -888,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
                goto out;
        }
 
-       mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+       inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
        err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
-       mutex_unlock(&d_inode(dir)->i_mutex);
+       inode_unlock(d_inode(dir));
 
        if (!err)
                err = buf.pos;
@@ -905,7 +905,7 @@ static int create_privroot(struct dentry *dentry)
        int err;
        struct inode *inode = d_inode(dentry->d_parent);
 
-       WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+       WARN_ON_ONCE(!inode_is_locked(inode));
 
        err = xattr_mkdir(inode, dentry, 0700);
        if (err || d_really_is_negative(dentry)) {
@@ -995,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
        int err = 0;
 
        /* If we don't have the privroot located yet - go find it */
-       mutex_lock(&d_inode(s->s_root)->i_mutex);
+       inode_lock(d_inode(s->s_root));
        dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
@@ -1005,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
                        d_inode(dentry)->i_flags |= S_PRIVATE;
        } else
                err = PTR_ERR(dentry);
-       mutex_unlock(&d_inode(s->s_root)->i_mutex);
+       inode_unlock(d_inode(s->s_root));
 
        return err;
 }
@@ -1025,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
                goto error;
 
        if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
-               mutex_lock(&d_inode(s->s_root)->i_mutex);
+               inode_lock(d_inode(s->s_root));
                err = create_privroot(REISERFS_SB(s)->priv_root);
-               mutex_unlock(&d_inode(s->s_root)->i_mutex);
+               inode_unlock(d_inode(s->s_root));
        }
 
        if (d_really_is_positive(privroot)) {
                s->s_xattr = reiserfs_xattr_handlers;
-               mutex_lock(&d_inode(privroot)->i_mutex);
+               inode_lock(d_inode(privroot));
                if (!REISERFS_SB(s)->xattr_root) {
                        struct dentry *dentry;
 
@@ -1043,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
                        else
                                err = PTR_ERR(dentry);
                }
-               mutex_unlock(&d_inode(privroot)->i_mutex);
+               inode_unlock(d_inode(privroot));
        }
 
 error:
index c66f2423e1f5c511b0202cfdab46a5e2be53d7f8..4a0e48f9210483adf6b9bee36f220e80d65a09cd 100644 (file)
@@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo
         * the files within the tracefs system. It is up to the individual
         * mkdir routine to handle races.
         */
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        ret = tracefs_ops.mkdir(name);
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        kfree(name);
 
@@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
         * This time we need to unlock not only the parent (inode) but
         * also the directory that is being deleted.
         */
-       mutex_unlock(&inode->i_mutex);
-       mutex_unlock(&dentry->d_inode->i_mutex);
+       inode_unlock(inode);
+       inode_unlock(dentry->d_inode);
 
        ret = tracefs_ops.rmdir(name);
 
-       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-       mutex_lock(&dentry->d_inode->i_mutex);
+       inode_lock_nested(inode, I_MUTEX_PARENT);
+       inode_lock(dentry->d_inode);
 
        kfree(name);
 
@@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
        if (!parent)
                parent = tracefs_mount->mnt_root;
 
-       mutex_lock(&parent->d_inode->i_mutex);
+       inode_lock(parent->d_inode);
        dentry = lookup_one_len(name, parent, strlen(name));
        if (!IS_ERR(dentry) && dentry->d_inode) {
                dput(dentry);
@@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
        }
 
        if (IS_ERR(dentry)) {
-               mutex_unlock(&parent->d_inode->i_mutex);
+               inode_unlock(parent->d_inode);
                simple_release_fs(&tracefs_mount, &tracefs_mount_count);
        }
 
@@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 
 static struct dentry *failed_creating(struct dentry *dentry)
 {
-       mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+       inode_unlock(dentry->d_parent->d_inode);
        dput(dentry);
        simple_release_fs(&tracefs_mount, &tracefs_mount_count);
        return NULL;
@@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
 
 static struct dentry *end_creating(struct dentry *dentry)
 {
-       mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+       inode_unlock(dentry->d_parent->d_inode);
        return dentry;
 }
 
@@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry)
        if (!parent || !parent->d_inode)
                return;
 
-       mutex_lock(&parent->d_inode->i_mutex);
+       inode_lock(parent->d_inode);
        ret = __tracefs_remove(dentry, parent);
-       mutex_unlock(&parent->d_inode->i_mutex);
+       inode_unlock(parent->d_inode);
        if (!ret)
                simple_release_fs(&tracefs_mount, &tracefs_mount_count);
 }
@@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 
        parent = dentry;
  down:
-       mutex_lock(&parent->d_inode->i_mutex);
+       inode_lock(parent->d_inode);
  loop:
        /*
         * The parent->d_subdirs is protected by the d_lock. Outside that
@@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
                /* perhaps simple_empty(child) makes more sense */
                if (!list_empty(&child->d_subdirs)) {
                        spin_unlock(&parent->d_lock);
-                       mutex_unlock(&parent->d_inode->i_mutex);
+                       inode_unlock(parent->d_inode);
                        parent = child;
                        goto down;
                }
@@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry)
        }
        spin_unlock(&parent->d_lock);
 
-       mutex_unlock(&parent->d_inode->i_mutex);
+       inode_unlock(parent->d_inode);
        child = parent;
        parent = parent->d_parent;
-       mutex_lock(&parent->d_inode->i_mutex);
+       inode_lock(parent->d_inode);
 
        if (child != dentry)
                /* go up */
@@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 
        if (!__tracefs_remove(child, parent))
                simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-       mutex_unlock(&parent->d_inode->i_mutex);
+       inode_unlock(parent->d_inode);
 }
 
 /**
index e49bd2808bf3e397b9b499fe4d6036ad7600a71b..795992a8321e9e0c9531c92f97c91cc10835cc02 100644 (file)
@@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
        dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu",
                dentry, inode->i_ino,
                inode->i_nlink, dir->i_ino);
-       ubifs_assert(mutex_is_locked(&dir->i_mutex));
-       ubifs_assert(mutex_is_locked(&inode->i_mutex));
+       ubifs_assert(inode_is_locked(dir));
+       ubifs_assert(inode_is_locked(inode));
 
        err = dbg_check_synced_i_size(c, inode);
        if (err)
@@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
        dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
                dentry, inode->i_ino,
                inode->i_nlink, dir->i_ino);
-       ubifs_assert(mutex_is_locked(&dir->i_mutex));
-       ubifs_assert(mutex_is_locked(&inode->i_mutex));
+       ubifs_assert(inode_is_locked(dir));
+       ubifs_assert(inode_is_locked(inode));
        err = dbg_check_synced_i_size(c, inode);
        if (err)
                return err;
@@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 
        dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry,
                inode->i_ino, dir->i_ino);
-       ubifs_assert(mutex_is_locked(&dir->i_mutex));
-       ubifs_assert(mutex_is_locked(&inode->i_mutex));
+       ubifs_assert(inode_is_locked(dir));
+       ubifs_assert(inode_is_locked(inode));
        err = check_dir_empty(c, d_inode(dentry));
        if (err)
                return err;
@@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
        dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu",
                old_dentry, old_inode->i_ino, old_dir->i_ino,
                new_dentry, new_dir->i_ino);
-       ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
-       ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+       ubifs_assert(inode_is_locked(old_dir));
+       ubifs_assert(inode_is_locked(new_dir));
        if (unlink)
-               ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+               ubifs_assert(inode_is_locked(new_inode));
 
 
        if (unlink && is_dir) {
index eff62801acbf10524e31f8ad7816c160e6f88a90..065c88f8e4b8c2d689e8fe9ce13122b5465e8f63 100644 (file)
@@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        err = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (err)
                return err;
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        /* Synchronize the inode unless this is a 'datasync()' call. */
        if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
@@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
         */
        err = ubifs_sync_wbufs_by_inode(c, inode);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err;
 }
 
index e53292d0c21bcd9d57c3f219d3f7cc31d5eb836e..c7f4d434d098fb7ac4a35df55e2c60adb179d4c0 100644 (file)
@@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value,
        union ubifs_key key;
        int err, type;
 
-       ubifs_assert(mutex_is_locked(&host->i_mutex));
+       ubifs_assert(inode_is_locked(host));
 
        if (size > UBIFS_MAX_INO_DATA)
                return -ERANGE;
@@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
 
        dbg_gen("xattr '%s', ino %lu ('%pd')", name,
                host->i_ino, dentry);
-       ubifs_assert(mutex_is_locked(&host->i_mutex));
+       ubifs_assert(inode_is_locked(host));
 
        err = check_namespace(&nm);
        if (err < 0)
index bddf3d071daec536fe5a3e775b2e041e0df5a620..1af98963d860f0e4ed2959d256ff53737d070fcb 100644 (file)
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct udf_inode_info *iinfo = UDF_I(inode);
        int err;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        retval = generic_write_checks(iocb, from);
        if (retval <= 0)
@@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                                (udf_file_entry_alloc_offset(inode) + end)) {
                        err = udf_expand_file_adinicb(inode);
                        if (err) {
-                               mutex_unlock(&inode->i_mutex);
+                               inode_unlock(inode);
                                udf_debug("udf_expand_adinicb: err=%d\n", err);
                                return err;
                        }
@@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
        retval = __generic_file_write_iter(iocb, from);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (retval > 0) {
                mark_inode_dirty(inode);
@@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp)
                 * Grab i_mutex to avoid races with writes changing i_size
                 * while we are running.
                 */
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                down_write(&UDF_I(inode)->i_data_sem);
                udf_discard_prealloc(inode);
                udf_truncate_tail_extent(inode);
                up_write(&UDF_I(inode)->i_data_sem);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        return 0;
 }
index 87dc16d155723eefb61b2a28c906287d4bf11222..166d3ed32c39a54b48c4ec93e2297de3d9efdcc2 100644 (file)
@@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode)
                .nr_to_write = 1,
        };
 
-       WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+       WARN_ON_ONCE(!inode_is_locked(inode));
        if (!iinfo->i_lenAlloc) {
                if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
                        iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
index 0fbb4c7c72e83701bb32ef066908a33b435900ae..a522c15a0bfd7e5e9e0d758bfbc59185a8cb2609 100644 (file)
@@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
 {
        int i;
        int nr_groups = bitmap->s_nr_groups;
-       int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
-                                               nr_groups);
 
        for (i = 0; i < nr_groups; i++)
                if (bitmap->s_block_bitmap[i])
                        brelse(bitmap->s_block_bitmap[i]);
 
-       if (size <= PAGE_SIZE)
-               kfree(bitmap);
-       else
-               vfree(bitmap);
+       kvfree(bitmap);
 }
 
 static void udf_free_partition(struct udf_part_map *map)
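kvfree() folds the old size-based kfree()/vfree() choice into a single call that inspects the pointer instead. A sketch of the helper, assuming the mm/util.c definition:

void kvfree(const void *addr)
{
        /* vmalloc memory lives in its own address range, so the right
         * deallocator can be derived from the pointer itself. */
        if (is_vmalloc_addr(addr))
                vfree(addr);
        else
                kfree(addr);
}

The caller no longer has to recompute the allocation size just to pick a deallocator.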
index aa138d64560a6a3c2133bc70d57e367cc8c1476d..85c40f4f373d56b3d5d41ed2fe7459759c8a3aeb 100644 (file)
@@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times)
                }
        }
 retry_deleg:
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        error = notify_change(path->dentry, &newattrs, &delegated_inode);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
index d5dd6c8b82a712085c105ec1c30f827c639ff614..07d0e47f6a7f1968782b2b31be478a6dd3851656 100644 (file)
@@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
        if (error)
                return error;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        error = security_inode_setxattr(dentry, name, value, size, flags);
        if (error)
                goto out;
@@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
        error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -277,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
        if (error)
                return error;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        error = security_inode_removexattr(dentry, name);
        if (error)
                goto out;
@@ -290,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
        }
 
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return error;
 }
 EXPORT_SYMBOL_GPL(vfs_removexattr);
index e2536bb1c760036af5efff9e13437a26b179da27..dc97eb21af071b0e040fe0d74b8ba51214e28cf9 100644 (file)
@@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 
 /*
  * Values for di_flags
- * There should be a one-to-one correspondence between these flags and the
- * XFS_XFLAG_s.
  */
 #define XFS_DIFLAG_REALTIME_BIT  0     /* file's blocks come from rt area */
 #define XFS_DIFLAG_PREALLOC_BIT  1     /* file space has been preallocated */
@@ -1025,6 +1023,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
         XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
         XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
 
+/*
+ * Values for di_flags2. These start by being exposed to userspace in the
+ * upper 16 bits of the XFS_XFLAG_s range.
+ */
+#define XFS_DIFLAG2_DAX_BIT    0       /* use DAX for this inode */
+#define XFS_DIFLAG2_DAX                (1 << XFS_DIFLAG2_DAX_BIT)
+
+#define XFS_DIFLAG2_ANY                (XFS_DIFLAG2_DAX)
+
 /*
  * Inode number format:
  * low inopblog bits - offset in block
index b2b73a998d425d04779262ead61d02d65b153bc8..fffe3d01bd9fb5c771a4919edee1c9f615ab49e2 100644 (file)
@@ -35,40 +35,6 @@ struct dioattr {
 };
 #endif
 
-/*
- * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
- */
-#ifndef HAVE_FSXATTR
-struct fsxattr {
-       __u32           fsx_xflags;     /* xflags field value (get/set) */
-       __u32           fsx_extsize;    /* extsize field value (get/set)*/
-       __u32           fsx_nextents;   /* nextents field value (get)   */
-       __u32           fsx_projid;     /* project identifier (get/set) */
-       unsigned char   fsx_pad[12];
-};
-#endif
-
-/*
- * Flags for the bs_xflags/fsx_xflags field
- * There should be a one-to-one correspondence between these flags and the
- * XFS_DIFLAG_s.
- */
-#define XFS_XFLAG_REALTIME     0x00000001      /* data in realtime volume */
-#define XFS_XFLAG_PREALLOC     0x00000002      /* preallocated file extents */
-#define XFS_XFLAG_IMMUTABLE    0x00000008      /* file cannot be modified */
-#define XFS_XFLAG_APPEND       0x00000010      /* all writes append */
-#define XFS_XFLAG_SYNC         0x00000020      /* all writes synchronous */
-#define XFS_XFLAG_NOATIME      0x00000040      /* do not update access time */
-#define XFS_XFLAG_NODUMP       0x00000080      /* do not include in backups */
-#define XFS_XFLAG_RTINHERIT    0x00000100      /* create with rt bit set */
-#define XFS_XFLAG_PROJINHERIT  0x00000200      /* create with parents projid */
-#define XFS_XFLAG_NOSYMLINKS   0x00000400      /* disallow symlink creation */
-#define XFS_XFLAG_EXTSIZE      0x00000800      /* extent size allocator hint */
-#define XFS_XFLAG_EXTSZINHERIT 0x00001000      /* inherit inode extent size */
-#define XFS_XFLAG_NODEFRAG     0x00002000      /* do not defragment */
-#define XFS_XFLAG_FILESTREAM   0x00004000      /* use filestream allocator */
-#define XFS_XFLAG_HASATTR      0x80000000      /* no DIFLAG for this   */
-
 /*
  * Structure for XFS_IOC_GETBMAP.
  * On input, fill in bmv_offset and bmv_length of the first structure
@@ -514,8 +480,8 @@ typedef struct xfs_swapext
 #define XFS_IOC_ALLOCSP                _IOW ('X', 10, struct xfs_flock64)
 #define XFS_IOC_FREESP         _IOW ('X', 11, struct xfs_flock64)
 #define XFS_IOC_DIOINFO                _IOR ('X', 30, struct dioattr)
-#define XFS_IOC_FSGETXATTR     _IOR ('X', 31, struct fsxattr)
-#define XFS_IOC_FSSETXATTR     _IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_FSGETXATTR     FS_IOC_FSGETXATTR
+#define XFS_IOC_FSSETXATTR     FS_IOC_FSSETXATTR
 #define XFS_IOC_ALLOCSP64      _IOW ('X', 36, struct xfs_flock64)
 #define XFS_IOC_FREESP64       _IOW ('X', 37, struct xfs_flock64)
 #define XFS_IOC_GETBMAP                _IOWR('X', 38, struct getbmap)
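With the XFS ioctls now aliased to the generic FS_IOC_* numbers, the fsxattr interface no longer needs XFS headers. A hypothetical userspace probe for the new DAX xflag (assumes the uapi <linux/fs.h> definitions of struct fsxattr, FS_IOC_FSGETXATTR and FS_XFLAG_DAX):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
        struct fsxattr fsx;
        int fd;

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fsx))
                return 1;

        printf("xflags 0x%x, dax=%d\n", fsx.fsx_xflags,
               !!(fsx.fsx_xflags & FS_XFLAG_DAX));
        return 0;
}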
index daed4bfb85b2f2d56a3e82fa5ba4156f9f41011d..435c7de42e5f322a82845382ad7e1fa54dfe3d0b 100644 (file)
@@ -1527,6 +1527,16 @@ xfs_wait_buftarg(
        LIST_HEAD(dispose);
        int loop = 0;
 
+       /*
+        * We need to flush the buffer workqueue to ensure that all IO
+        * completion processing is 100% done. Just waiting on buffer locks is
+        * not sufficient for async IO as the reference count held over IO is
+        * not released until after the buffer lock is dropped. Hence we need to
+        * ensure here that all reference counts have been dropped before we
+        * start walking the LRU list.
+        */
+       drain_workqueue(btp->bt_mount->m_buf_workqueue);
+
        /* loop until there is nothing left on the lru list. */
        while (list_lru_count(&btp->bt_lru)) {
                list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
index ebe9b8290a705c402759c985c015b0bbc5f6a151..52883ac3cf84c06761afcf0792bbafcf781d509b 100644 (file)
@@ -55,7 +55,7 @@ xfs_rw_ilock(
        int                     type)
 {
        if (type & XFS_IOLOCK_EXCL)
-               mutex_lock(&VFS_I(ip)->i_mutex);
+               inode_lock(VFS_I(ip));
        xfs_ilock(ip, type);
 }
 
@@ -66,7 +66,7 @@ xfs_rw_iunlock(
 {
        xfs_iunlock(ip, type);
        if (type & XFS_IOLOCK_EXCL)
-               mutex_unlock(&VFS_I(ip)->i_mutex);
+               inode_unlock(VFS_I(ip));
 }
 
 static inline void
@@ -76,7 +76,7 @@ xfs_rw_ilock_demote(
 {
        xfs_ilock_demote(ip, type);
        if (type & XFS_IOLOCK_EXCL)
-               mutex_unlock(&VFS_I(ip)->i_mutex);
+               inode_unlock(VFS_I(ip));
 }
 
 /*
@@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault(
 /*
  * pfn_mkwrite was originally intended to ensure we capture time stamp
  * updates on write faults. In reality, it's needed to serialise against
- * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
- * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
- * barrier in place.
+ * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
+ * to ensure we serialise the fault barrier in place.
  */
 static int
 xfs_filemap_pfn_mkwrite(
@@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite(
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
                ret = VM_FAULT_SIGBUS;
+       else if (IS_DAX(inode))
+               ret = dax_pfn_mkwrite(vma, vmf);
        xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
        sb_end_pagefault(inode->i_sb);
        return ret;
index ae3758a90ed63ba6b714ccbdafdf007207a8be07..ceba1a83cacccd649caf473ebcf2dfae984bb243 100644 (file)
@@ -610,60 +610,69 @@ __xfs_iflock(
 
 STATIC uint
 _xfs_dic2xflags(
-       __uint16_t              di_flags)
+       __uint16_t              di_flags,
+       uint64_t                di_flags2,
+       bool                    has_attr)
 {
        uint                    flags = 0;
 
        if (di_flags & XFS_DIFLAG_ANY) {
                if (di_flags & XFS_DIFLAG_REALTIME)
-                       flags |= XFS_XFLAG_REALTIME;
+                       flags |= FS_XFLAG_REALTIME;
                if (di_flags & XFS_DIFLAG_PREALLOC)
-                       flags |= XFS_XFLAG_PREALLOC;
+                       flags |= FS_XFLAG_PREALLOC;
                if (di_flags & XFS_DIFLAG_IMMUTABLE)
-                       flags |= XFS_XFLAG_IMMUTABLE;
+                       flags |= FS_XFLAG_IMMUTABLE;
                if (di_flags & XFS_DIFLAG_APPEND)
-                       flags |= XFS_XFLAG_APPEND;
+                       flags |= FS_XFLAG_APPEND;
                if (di_flags & XFS_DIFLAG_SYNC)
-                       flags |= XFS_XFLAG_SYNC;
+                       flags |= FS_XFLAG_SYNC;
                if (di_flags & XFS_DIFLAG_NOATIME)
-                       flags |= XFS_XFLAG_NOATIME;
+                       flags |= FS_XFLAG_NOATIME;
                if (di_flags & XFS_DIFLAG_NODUMP)
-                       flags |= XFS_XFLAG_NODUMP;
+                       flags |= FS_XFLAG_NODUMP;
                if (di_flags & XFS_DIFLAG_RTINHERIT)
-                       flags |= XFS_XFLAG_RTINHERIT;
+                       flags |= FS_XFLAG_RTINHERIT;
                if (di_flags & XFS_DIFLAG_PROJINHERIT)
-                       flags |= XFS_XFLAG_PROJINHERIT;
+                       flags |= FS_XFLAG_PROJINHERIT;
                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
-                       flags |= XFS_XFLAG_NOSYMLINKS;
+                       flags |= FS_XFLAG_NOSYMLINKS;
                if (di_flags & XFS_DIFLAG_EXTSIZE)
-                       flags |= XFS_XFLAG_EXTSIZE;
+                       flags |= FS_XFLAG_EXTSIZE;
                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
-                       flags |= XFS_XFLAG_EXTSZINHERIT;
+                       flags |= FS_XFLAG_EXTSZINHERIT;
                if (di_flags & XFS_DIFLAG_NODEFRAG)
-                       flags |= XFS_XFLAG_NODEFRAG;
+                       flags |= FS_XFLAG_NODEFRAG;
                if (di_flags & XFS_DIFLAG_FILESTREAM)
-                       flags |= XFS_XFLAG_FILESTREAM;
+                       flags |= FS_XFLAG_FILESTREAM;
        }
 
+       if (di_flags2 & XFS_DIFLAG2_ANY) {
+               if (di_flags2 & XFS_DIFLAG2_DAX)
+                       flags |= FS_XFLAG_DAX;
+       }
+
+       if (has_attr)
+               flags |= FS_XFLAG_HASATTR;
+
        return flags;
 }
 
 uint
 xfs_ip2xflags(
-       xfs_inode_t             *ip)
+       struct xfs_inode        *ip)
 {
-       xfs_icdinode_t          *dic = &ip->i_d;
+       struct xfs_icdinode     *dic = &ip->i_d;
 
-       return _xfs_dic2xflags(dic->di_flags) |
-                               (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
+       return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
 }
 
 uint
 xfs_dic2xflags(
-       xfs_dinode_t            *dip)
+       struct xfs_dinode       *dip)
 {
-       return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
-                               (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
+       return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
+                               be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
 }
 
 /*
@@ -862,7 +871,8 @@ xfs_ialloc(
        case S_IFREG:
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
-                       uint    di_flags = 0;
+                       uint64_t        di_flags2 = 0;
+                       uint            di_flags = 0;
 
                        if (S_ISDIR(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
@@ -898,7 +908,11 @@ xfs_ialloc(
                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
                                di_flags |= XFS_DIFLAG_FILESTREAM;
+                       if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+                               di_flags2 |= XFS_DIFLAG2_DAX;
+
                        ip->i_d.di_flags |= di_flags;
+                       ip->i_d.di_flags2 |= di_flags2;
                }
                /* FALLTHROUGH */
        case S_IFLNK:
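Because xfs_ialloc() copies XFS_DIFLAG2_DAX from the parent, marking a directory DAX makes newly created children inherit it. A hypothetical helper showing how that could be driven from userspace through the promoted ioctls (set_dax is an illustrative name, not kernel API):

#include <linux/fs.h>
#include <sys/ioctl.h>

/* Set FS_XFLAG_DAX on an open directory; files and subdirectories
 * created inside it should then inherit the on-disk DAX flag. */
static int set_dax(int dirfd)
{
        struct fsxattr fsx;

        if (ioctl(dirfd, FS_IOC_FSGETXATTR, &fsx))
                return -1;
        fsx.fsx_xflags |= FS_XFLAG_DAX;
        return ioctl(dirfd, FS_IOC_FSSETXATTR, &fsx);
}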
index d42738deec6de6128b1a4b0784c45b5447848d02..478d04e07f9500d6ceed20231deb6b474161c708 100644 (file)
@@ -859,25 +859,25 @@ xfs_merge_ioc_xflags(
        unsigned int    xflags = start;
 
        if (flags & FS_IMMUTABLE_FL)
-               xflags |= XFS_XFLAG_IMMUTABLE;
+               xflags |= FS_XFLAG_IMMUTABLE;
        else
-               xflags &= ~XFS_XFLAG_IMMUTABLE;
+               xflags &= ~FS_XFLAG_IMMUTABLE;
        if (flags & FS_APPEND_FL)
-               xflags |= XFS_XFLAG_APPEND;
+               xflags |= FS_XFLAG_APPEND;
        else
-               xflags &= ~XFS_XFLAG_APPEND;
+               xflags &= ~FS_XFLAG_APPEND;
        if (flags & FS_SYNC_FL)
-               xflags |= XFS_XFLAG_SYNC;
+               xflags |= FS_XFLAG_SYNC;
        else
-               xflags &= ~XFS_XFLAG_SYNC;
+               xflags &= ~FS_XFLAG_SYNC;
        if (flags & FS_NOATIME_FL)
-               xflags |= XFS_XFLAG_NOATIME;
+               xflags |= FS_XFLAG_NOATIME;
        else
-               xflags &= ~XFS_XFLAG_NOATIME;
+               xflags &= ~FS_XFLAG_NOATIME;
        if (flags & FS_NODUMP_FL)
-               xflags |= XFS_XFLAG_NODUMP;
+               xflags |= FS_XFLAG_NODUMP;
        else
-               xflags &= ~XFS_XFLAG_NODUMP;
+               xflags &= ~FS_XFLAG_NODUMP;
 
        return xflags;
 }
@@ -945,40 +945,51 @@ xfs_set_diflags(
        unsigned int            xflags)
 {
        unsigned int            di_flags;
+       uint64_t                di_flags2;
 
        /* can't set PREALLOC this way, just preserve it */
        di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
-       if (xflags & XFS_XFLAG_IMMUTABLE)
+       if (xflags & FS_XFLAG_IMMUTABLE)
                di_flags |= XFS_DIFLAG_IMMUTABLE;
-       if (xflags & XFS_XFLAG_APPEND)
+       if (xflags & FS_XFLAG_APPEND)
                di_flags |= XFS_DIFLAG_APPEND;
-       if (xflags & XFS_XFLAG_SYNC)
+       if (xflags & FS_XFLAG_SYNC)
                di_flags |= XFS_DIFLAG_SYNC;
-       if (xflags & XFS_XFLAG_NOATIME)
+       if (xflags & FS_XFLAG_NOATIME)
                di_flags |= XFS_DIFLAG_NOATIME;
-       if (xflags & XFS_XFLAG_NODUMP)
+       if (xflags & FS_XFLAG_NODUMP)
                di_flags |= XFS_DIFLAG_NODUMP;
-       if (xflags & XFS_XFLAG_NODEFRAG)
+       if (xflags & FS_XFLAG_NODEFRAG)
                di_flags |= XFS_DIFLAG_NODEFRAG;
-       if (xflags & XFS_XFLAG_FILESTREAM)
+       if (xflags & FS_XFLAG_FILESTREAM)
                di_flags |= XFS_DIFLAG_FILESTREAM;
        if (S_ISDIR(ip->i_d.di_mode)) {
-               if (xflags & XFS_XFLAG_RTINHERIT)
+               if (xflags & FS_XFLAG_RTINHERIT)
                        di_flags |= XFS_DIFLAG_RTINHERIT;
-               if (xflags & XFS_XFLAG_NOSYMLINKS)
+               if (xflags & FS_XFLAG_NOSYMLINKS)
                        di_flags |= XFS_DIFLAG_NOSYMLINKS;
-               if (xflags & XFS_XFLAG_EXTSZINHERIT)
+               if (xflags & FS_XFLAG_EXTSZINHERIT)
                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-               if (xflags & XFS_XFLAG_PROJINHERIT)
+               if (xflags & FS_XFLAG_PROJINHERIT)
                        di_flags |= XFS_DIFLAG_PROJINHERIT;
        } else if (S_ISREG(ip->i_d.di_mode)) {
-               if (xflags & XFS_XFLAG_REALTIME)
+               if (xflags & FS_XFLAG_REALTIME)
                        di_flags |= XFS_DIFLAG_REALTIME;
-               if (xflags & XFS_XFLAG_EXTSIZE)
+               if (xflags & FS_XFLAG_EXTSIZE)
                        di_flags |= XFS_DIFLAG_EXTSIZE;
        }
-
        ip->i_d.di_flags = di_flags;
+
+       /* diflags2 only valid for v3 inodes. */
+       if (ip->i_d.di_version < 3)
+               return;
+
+       di_flags2 = 0;
+       if (xflags & FS_XFLAG_DAX)
+               di_flags2 |= XFS_DIFLAG2_DAX;
+
+       ip->i_d.di_flags2 = di_flags2;
+
 }
 
 STATIC void
@@ -988,22 +999,27 @@ xfs_diflags_to_linux(
        struct inode            *inode = VFS_I(ip);
        unsigned int            xflags = xfs_ip2xflags(ip);
 
-       if (xflags & XFS_XFLAG_IMMUTABLE)
+       if (xflags & FS_XFLAG_IMMUTABLE)
                inode->i_flags |= S_IMMUTABLE;
        else
                inode->i_flags &= ~S_IMMUTABLE;
-       if (xflags & XFS_XFLAG_APPEND)
+       if (xflags & FS_XFLAG_APPEND)
                inode->i_flags |= S_APPEND;
        else
                inode->i_flags &= ~S_APPEND;
-       if (xflags & XFS_XFLAG_SYNC)
+       if (xflags & FS_XFLAG_SYNC)
                inode->i_flags |= S_SYNC;
        else
                inode->i_flags &= ~S_SYNC;
-       if (xflags & XFS_XFLAG_NOATIME)
+       if (xflags & FS_XFLAG_NOATIME)
                inode->i_flags |= S_NOATIME;
        else
                inode->i_flags &= ~S_NOATIME;
+       if (xflags & FS_XFLAG_DAX)
+               inode->i_flags |= S_DAX;
+       else
+               inode->i_flags &= ~S_DAX;
+
 }
 
 static int
@@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags(
 
        /* Can't change realtime flag if any extents are allocated. */
        if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
-           XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+           XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
                return -EINVAL;
 
        /* If realtime flag is set then must have realtime device */
-       if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+       if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
                if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
                    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
                        return -EINVAL;
@@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags(
         * we have appropriate permission.
         */
        if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
-            (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+            (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;
 
@@ -1095,8 +1111,8 @@ out_cancel:
  * extent size hint validation is somewhat cumbersome. Rules are:
  *
  * 1. extent size hint is only valid for directories and regular files
- * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
- * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
  * 4. can only be changed on regular files if no extents are allocated
  * 5. can be changed on directories at any time
  * 6. extsize hint of 0 turns off hints, clears inode flags.
@@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize(
 {
        struct xfs_mount        *mp = ip->i_mount;
 
-       if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+       if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
                return -EINVAL;
 
-       if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
+       if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
            !S_ISDIR(ip->i_d.di_mode))
                return -EINVAL;
 
@@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize(
                        return -EINVAL;
 
                if (XFS_IS_REALTIME_INODE(ip) ||
-                   (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+                   (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
                        size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
                } else {
                        size = mp->m_sb.sb_blocksize;
@@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize(
                if (fa->fsx_extsize % size)
                        return -EINVAL;
        } else
-               fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+               fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
 
        return 0;
 }
@@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid(
 
        if (xfs_get_projid(ip) != fa->fsx_projid)
                return -EINVAL;
-       if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+       if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=
            (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
                return -EINVAL;
 
index 06eafafe636e20f90e8f696ee20c0bb01b64ea8f..76b71a1c6c323e2043aeab1e93fb5a66db9f39d0 100644 (file)
@@ -1205,8 +1205,8 @@ xfs_diflags_to_iflags(
                inode->i_flags |= S_SYNC;
        if (flags & XFS_DIFLAG_NOATIME)
                inode->i_flags |= S_NOATIME;
-       /* XXX: Also needs an on-disk per inode flag! */
-       if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+       if (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
+           ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
                inode->i_flags |= S_DAX;
 }
 
index dc6221942b85f437767decfca9bf7ca9b0a94026..ade236e90bb3612d429a8b6b0909b3937302096c 100644 (file)
@@ -42,11 +42,11 @@ xfs_break_layouts(
        while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
                xfs_iunlock(ip, *iolock);
                if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                error = break_layout(inode, true);
                *iolock = XFS_IOLOCK_EXCL;
                if (with_imutex)
-                       mutex_lock(&inode->i_mutex);
+                       inode_lock(inode);
                xfs_ilock(ip, *iolock);
        }
 
index aa67339b953722b4e5c73442ac14835f0f9c4e32..4f18fd92ca13b21d8fd68e955e082d9db9a61195 100644 (file)
@@ -497,7 +497,6 @@ xfsaild(
        long            tout = 0;       /* milliseconds */
 
        current->flags |= PF_MEMALLOC;
-       set_freezable();
 
        while (!kthread_should_stop()) {
                if (tout && tout <= 20)
index 3d69c93d50e83d7b804f9de7260007ffc9903d69..6361892ea737137899c5d9489634446cb3bdedac 100644 (file)
@@ -204,6 +204,7 @@ struct crypto_ahash {
                      unsigned int keylen);
 
        unsigned int reqsize;
+       bool has_setkey;
        struct crypto_tfm base;
 };
 
@@ -375,6 +376,11 @@ static inline void *ahash_request_ctx(struct ahash_request *req)
 int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
                        unsigned int keylen);
 
+static inline bool crypto_ahash_has_setkey(struct crypto_ahash *tfm)
+{
+       return tfm->has_setkey;
+}
+
 /**
  * crypto_ahash_finup() - update and finalize message digest
  * @req: reference to the ahash_request handle that holds all information
index 018afb264ac261ea5dea2f3afa547cf9a8f7a202..a2bfd7843f18f6e79d8bc7e5743b031345153053 100644 (file)
@@ -30,6 +30,9 @@ struct alg_sock {
 
        struct sock *parent;
 
+       unsigned int refcnt;
+       unsigned int nokey_refcnt;
+
        const struct af_alg_type *type;
        void *private;
 };
@@ -50,9 +53,11 @@ struct af_alg_type {
        void (*release)(void *private);
        int (*setkey)(void *private, const u8 *key, unsigned int keylen);
        int (*accept)(void *private, struct sock *sk);
+       int (*accept_nokey)(void *private, struct sock *sk);
        int (*setauthsize)(void *private, unsigned int authsize);
 
        struct proto_ops *ops;
+       struct proto_ops *ops_nokey;
        struct module *owner;
        char name[14];
 };
@@ -67,6 +72,7 @@ int af_alg_register_type(const struct af_alg_type *type);
 int af_alg_unregister_type(const struct af_alg_type *type);
 
 int af_alg_release(struct socket *sock);
+void af_alg_release_parent(struct sock *sk);
 int af_alg_accept(struct sock *sk, struct socket *newsock);
 
 int af_alg_make_sg(struct af_alg_sgl *sgl, struct iov_iter *iter, int len);
@@ -83,11 +89,6 @@ static inline struct alg_sock *alg_sk(struct sock *sk)
        return (struct alg_sock *)sk;
 }
 
-static inline void af_alg_release_parent(struct sock *sk)
-{
-       sock_put(alg_sk(sk)->parent);
-}
-
 static inline void af_alg_init_completion(struct af_alg_completion *completion)
 {
        init_completion(&completion->completion);
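
Together with the ops_nokey/accept_nokey hooks above, the two reference counts let the algif front ends hand out restricted sockets while no key has been set. A rough sketch of the intended accept-path logic; alg_accept_sketch is an invented name, and locking plus error paths are elided:

    /* Hypothetical: choose the restricted ops until a key is present;
     * nokey_refcnt tracks the sockets that must be revalidated later. */
    static int alg_accept_sketch(struct alg_sock *ask, struct socket *newsock,
                                 bool key_set)
    {
            const struct af_alg_type *type = ask->type;

            if (!key_set && type->accept_nokey) {
                    newsock->ops = type->ops_nokey;
                    ask->nokey_refcnt++;    /* under the parent's lock */
                    return type->accept_nokey(ask->private, newsock->sk);
            }

            newsock->ops = type->ops;
            ask->refcnt++;
            return type->accept(ask->private, newsock->sk);
    }
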
index d8dd41fb034fe5af52b68699096abb6ebc5559d5..fd8742a40ff3e93476ec3b8cdb1370200275b6d3 100644 (file)
@@ -61,6 +61,8 @@ struct crypto_skcipher {
        unsigned int ivsize;
        unsigned int reqsize;
 
+       bool has_setkey;
+
        struct crypto_tfm base;
 };
 
@@ -305,6 +307,11 @@ static inline int crypto_skcipher_setkey(struct crypto_skcipher *tfm,
        return tfm->setkey(tfm, key, keylen);
 }
 
+static inline bool crypto_skcipher_has_setkey(struct crypto_skcipher *tfm)
+{
+       return tfm->has_setkey;
+}
+
 /**
  * crypto_skcipher_reqtfm() - obtain cipher handle from request
  * @req: skcipher_request out of which the cipher handle is to be obtained
index 744b997d6a94ae01852c190d195f89fa3c4124ae..164049357e5cee3a54d653d883f8639fc8e85243 100644 (file)
@@ -7,6 +7,7 @@
 #ifndef _AER_H_
 #define _AER_H_
 
+#include <linux/errno.h>
 #include <linux/types.h>
 
 #define AER_NONFATAL                   0
index b9b6e046b52ea5b9f783b69274fbdd6ac8fe5d23..5349e6816cbb075e2b5d6ac57cac1e8fee373438 100644 (file)
@@ -318,16 +318,6 @@ enum bip_flags {
        BIP_IP_CHECKSUM         = 1 << 4, /* IP checksum */
 };
 
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-static inline struct bio_integrity_payload *bio_integrity(struct bio *bio)
-{
-       if (bio->bi_rw & REQ_INTEGRITY)
-               return bio->bi_integrity;
-
-       return NULL;
-}
-
 /*
  * bio integrity payload
  */
@@ -349,6 +339,16 @@ struct bio_integrity_payload {
        struct bio_vec          bip_inline_vecs[0];/* embedded bvec array */
 };
 
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+
+static inline struct bio_integrity_payload *bio_integrity(struct bio *bio)
+{
+       if (bio->bi_rw & REQ_INTEGRITY)
+               return bio->bi_integrity;
+
+       return NULL;
+}
+
 static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag)
 {
        struct bio_integrity_payload *bip = bio_integrity(bio);
@@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag)
        return false;
 }
 
+static inline void *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
+                                                               unsigned int nr)
+{
+       return ERR_PTR(-EINVAL);
+}
+
+static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
+                                       unsigned int len, unsigned int offset)
+{
+       return 0;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 #endif /* CONFIG_BLOCK */
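
Moving bio_integrity() below the payload definition and adding !CONFIG_BLK_DEV_INTEGRITY stubs lets callers stay unconditional. A hypothetical caller shape (names invented):

    /* Compiles either way: without integrity support the alloc stub
     * returns ERR_PTR(-EINVAL) and we bail out cleanly. */
    static int attach_integrity_sketch(struct bio *bio, struct page *page,
                                       unsigned int len)
    {
            struct bio_integrity_payload *bip;

            bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
            if (IS_ERR(bip))
                    return PTR_ERR(bip);

            /* non-zero on success; the stub always returns 0 */
            return bio_integrity_add_page(bio, page, len, 0) ? 0 : -EIO;
    }
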
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h
deleted file mode 100644 (file)
index 77ae77c..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef BLK_IOPOLL_H
-#define BLK_IOPOLL_H
-
-struct blk_iopoll;
-typedef int (blk_iopoll_fn)(struct blk_iopoll *, int);
-
-struct blk_iopoll {
-       struct list_head list;
-       unsigned long state;
-       unsigned long data;
-       int weight;
-       int max;
-       blk_iopoll_fn *poll;
-};
-
-enum {
-       IOPOLL_F_SCHED          = 0,
-       IOPOLL_F_DISABLE        = 1,
-};
-
-/*
- * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating
- * that we were the first to acquire this iop for scheduling. If this iop
- * is currently disabled, return "failure".
- */
-static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop)
-{
-       if (!test_bit(IOPOLL_F_DISABLE, &iop->state))
-               return test_and_set_bit(IOPOLL_F_SCHED, &iop->state);
-
-       return 1;
-}
-
-static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop)
-{
-       return test_bit(IOPOLL_F_DISABLE, &iop->state);
-}
-
-extern void blk_iopoll_sched(struct blk_iopoll *);
-extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *);
-extern void blk_iopoll_complete(struct blk_iopoll *);
-extern void __blk_iopoll_complete(struct blk_iopoll *);
-extern void blk_iopoll_enable(struct blk_iopoll *);
-extern void blk_iopoll_disable(struct blk_iopoll *);
-
-#endif
index 0fb65843ec1e8522461dfe2d308c06e6fe33e2af..86a38ea1823f3307caec422941fcb15dcf7c4e25 100644 (file)
@@ -188,7 +188,6 @@ enum rq_flag_bits {
        __REQ_PM,               /* runtime pm request */
        __REQ_HASHED,           /* on IO scheduler merge hash */
        __REQ_MQ_INFLIGHT,      /* track inflight for MQ */
-       __REQ_NO_TIMEOUT,       /* requests may never expire */
        __REQ_NR_BITS,          /* stops here */
 };
 
@@ -242,7 +241,6 @@ enum rq_flag_bits {
 #define REQ_PM                 (1ULL << __REQ_PM)
 #define REQ_HASHED             (1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT                (1ULL << __REQ_MQ_INFLIGHT)
-#define REQ_NO_TIMEOUT         (1ULL << __REQ_NO_TIMEOUT)
 
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE  -1U
index d372ea87ead573e4b4c7d5369ac649f844124334..29189aeace19df0650c15e374e60724ce77ac917 100644 (file)
@@ -409,6 +409,7 @@ struct request_queue {
 
        unsigned int            rq_timeout;
        struct timer_list       timeout;
+       struct work_struct      timeout_work;
        struct list_head        timeout_list;
 
        struct list_head        icq_list;
index 5babb8e95352a6a7468feca70a343fafb56bddad..b827e066e55a198ec13f41d52b52964bd72edf4d 100644 (file)
@@ -40,46 +40,11 @@ static inline __u32 ceph_frag_mask_shift(__u32 f)
        return 24 - ceph_frag_bits(f);
 }
 
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+static inline bool ceph_frag_contains_value(__u32 f, __u32 v)
 {
        return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
 }
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
-       /* is sub as specific as us, and contained by us? */
-       return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
-              (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
 
-static inline __u32 ceph_frag_parent(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f) - 1,
-                        ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
-       return ceph_frag_bits(f) > 0 &&
-               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
-       return ceph_frag_bits(f) > 0 &&
-               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f),
-                     ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f)+1,
-             ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
 static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
 {
        int newbits = ceph_frag_bits(f) + by;
index 71b1d6cdcb5d1fc53dd45fa3950e138cfac19dfe..8dbd7879fdc6b74719d9cb65b85d8c60f589bde6 100644 (file)
@@ -220,6 +220,7 @@ struct ceph_connection {
        struct ceph_entity_addr actual_peer_addr;
 
        /* message out temps */
+       struct ceph_msg_header out_hdr;
        struct ceph_msg *out_msg;        /* sending message (== tail of
                                            out_sent) */
        bool out_msg_done;
@@ -229,7 +230,6 @@ struct ceph_connection {
        int out_kvec_left;   /* kvec's left in out_kvec */
        int out_skip;        /* skip this many bytes */
        int out_kvec_bytes;  /* total bytes left */
-       bool out_kvec_is_msg; /* kvec refers to out_msg */
        int out_more;        /* there is more data after the kvecs */
        __le64 out_temp_ack; /* for writing an ack */
        struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
index b415e521528de3d5fc5db598fb588f688f42d307..8204c3dc3800f37839513d7f2c0567d6a41f87a7 100644 (file)
@@ -36,4 +36,11 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
        return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
+
+static inline bool dax_mapping(struct address_space *mapping)
+{
+       return mapping->host && IS_DAX(mapping->host);
+}
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+               loff_t end);
 #endif
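
dax_mapping() and dax_writeback_mapping_range() are the fsync/msync hooks for the DAX dirty-entry tracking introduced elsewhere in this merge (see the nrexceptional change further down). A hedged sketch of how an fsync path would branch; range_writeback_sketch is an invented name:

    #include <linux/dax.h>
    #include <linux/fs.h>

    /* Hypothetical fsync helper: DAX exceptional entries are not pages,
     * so they are flushed via the DAX path instead of page writeback. */
    static int range_writeback_sketch(struct address_space *mapping,
                                      loff_t start, loff_t end)
    {
            if (dax_mapping(mapping))
                    return dax_writeback_mapping_range(mapping, start, end);

            return filemap_write_and_wait_range(mapping, start, end);
    }
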
index 8723f2a99e1588d7feba41625ebdbc9c924e94d0..d6b3c9943a2c49a6318f37ca6128e000e913c654 100644 (file)
@@ -25,7 +25,6 @@
 */
 #ifndef DRBD_H
 #define DRBD_H
-#include <linux/connector.h>
 #include <asm/types.h>
 
 #ifdef __KERNEL__
@@ -52,7 +51,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.5"
+#define REL_VERSION "8.4.6"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101
@@ -339,6 +338,8 @@ enum drbd_state_rv {
 #define MDF_AL_CLEAN           (1 << 7)
 #define MDF_AL_DISABLED                (1 << 8)
 
+#define MAX_PEERS 32
+
 enum drbd_uuid_index {
        UI_CURRENT,
        UI_BITMAP,
@@ -349,14 +350,35 @@ enum drbd_uuid_index {
        UI_EXTENDED_SIZE   /* Everything. */
 };
 
+#define HISTORY_UUIDS MAX_PEERS
+
 enum drbd_timeout_flag {
        UT_DEFAULT      = 0,
        UT_DEGRADED     = 1,
        UT_PEER_OUTDATED = 2,
 };
 
+enum drbd_notification_type {
+       NOTIFY_EXISTS,
+       NOTIFY_CREATE,
+       NOTIFY_CHANGE,
+       NOTIFY_DESTROY,
+       NOTIFY_CALL,
+       NOTIFY_RESPONSE,
+
+       NOTIFY_CONTINUES = 0x8000,
+       NOTIFY_FLAGS = NOTIFY_CONTINUES,
+};
+
 #define UUID_JUST_CREATED ((__u64)4)
 
+enum write_ordering_e {
+       WO_NONE,
+       WO_DRAIN_IO,
+       WO_BDEV_FLUSH,
+       WO_BIO_BARRIER
+};
+
 /* magic numbers used in meta data and network packets */
 #define DRBD_MAGIC 0x83740267
 #define DRBD_MAGIC_BIG 0x835a
index 7b131ed8f9c6696cfb1ec8b470c0d77c95dff07e..2d0e5ad5de9d34227700acfee0e188eb6c752471 100644 (file)
@@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
        __flg_field(1, DRBD_GENLA_F_MANDATORY,  force_detach)
 )
 
+GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info,
+       __u32_field(1, 0, res_role)
+       __flg_field(2, 0, res_susp)
+       __flg_field(3, 0, res_susp_nod)
+       __flg_field(4, 0, res_susp_fen)
+       /* __flg_field(5, 0, res_weak) */
+)
+
+GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info,
+       __u32_field(1, 0, dev_disk_state)
+)
+
+GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info,
+       __u32_field(1, 0, conn_connection_state)
+       __u32_field(2, 0, conn_role)
+)
+
+GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info,
+       __u32_field(1, 0, peer_repl_state)
+       __u32_field(2, 0, peer_disk_state)
+       __u32_field(3, 0, peer_resync_susp_user)
+       __u32_field(4, 0, peer_resync_susp_peer)
+       __u32_field(5, 0, peer_resync_susp_dependency)
+)
+
+GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics,
+       __u32_field(1, 0, res_stat_write_ordering)
+)
+
+GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics,
+       __u64_field(1, 0, dev_size)  /* (sectors) */
+       __u64_field(2, 0, dev_read)  /* (sectors) */
+       __u64_field(3, 0, dev_write)  /* (sectors) */
+       __u64_field(4, 0, dev_al_writes)  /* activity log writes (count) */
+       __u64_field(5, 0, dev_bm_writes)  /*  bitmap writes  (count) */
+       __u32_field(6, 0, dev_upper_pending)  /* application requests in progress */
+       __u32_field(7, 0, dev_lower_pending)  /* backing device requests in progress */
+       __flg_field(8, 0, dev_upper_blocked)
+       __flg_field(9, 0, dev_lower_blocked)
+       __flg_field(10, 0, dev_al_suspended)  /* activity log suspended */
+       __u64_field(11, 0, dev_exposed_data_uuid)
+       __u64_field(12, 0, dev_current_uuid)
+       __u32_field(13, 0, dev_disk_flags)
+       __bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64))
+)
+
+GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics,
+       __flg_field(1, 0, conn_congested)
+)
+
+GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics,
+       __u64_field(1, 0, peer_dev_received)  /* sectors */
+       __u64_field(2, 0, peer_dev_sent)  /* sectors */
+       __u32_field(3, 0, peer_dev_pending)  /* number of requests */
+       __u32_field(4, 0, peer_dev_unacked)  /* number of requests */
+       __u64_field(5, 0, peer_dev_out_of_sync)  /* sectors */
+       __u64_field(6, 0, peer_dev_resync_failed)  /* sectors */
+       __u64_field(7, 0, peer_dev_bitmap_uuid)
+       __u32_field(9, 0, peer_dev_flags)
+)
+
+GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header,
+       __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type)
+)
+
+GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info,
+       __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32)
+       __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status)
+)
+
 /*
  * Notifications and commands (genlmsghdr->cmd)
  */
@@ -382,3 +452,82 @@ GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type),
        GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
 GENL_op(DRBD_ADM_DOWN,         27, GENL_doit(drbd_adm_down),
        GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+
+GENL_op(DRBD_ADM_GET_RESOURCES, 30,
+        GENL_op_init(
+                .dumpit = drbd_adm_dump_resources,
+        ),
+        GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY))
+
+GENL_op(DRBD_ADM_GET_DEVICES, 31,
+        GENL_op_init(
+                .dumpit = drbd_adm_dump_devices,
+                .done = drbd_adm_dump_devices_done,
+        ),
+        GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
+
+GENL_op(DRBD_ADM_GET_CONNECTIONS, 32,
+        GENL_op_init(
+                .dumpit = drbd_adm_dump_connections,
+                .done = drbd_adm_dump_connections_done,
+        ),
+        GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY))
+
+GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33,
+        GENL_op_init(
+                .dumpit = drbd_adm_dump_peer_devices,
+                .done = drbd_adm_dump_peer_devices_done,
+        ),
+        GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
+        GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
+
+GENL_notification(
+       DRBD_RESOURCE_STATE, 34, events,
+       GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED))
+
+GENL_notification(
+       DRBD_DEVICE_STATE, 35, events,
+       GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED))
+
+GENL_notification(
+       DRBD_CONNECTION_STATE, 36, events,
+       GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED))
+
+GENL_notification(
+       DRBD_PEER_DEVICE_STATE, 37, events,
+       GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED))
+
+GENL_op(
+       DRBD_ADM_GET_INITIAL_STATE, 38,
+       GENL_op_init(
+               .dumpit = drbd_adm_get_initial_state,
+       ),
+       GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY))
+
+GENL_notification(
+       DRBD_HELPER, 40, events,
+       GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+       GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED))
+
+GENL_notification(
+       DRBD_INITIAL_STATE_DONE, 41, events,
+       GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED))
index eb73d74ed99262c83f2fa5e51582b2da0344bafa..1a2046275cdf0353e68a9cf38c5fe8762a62fdc2 100644 (file)
@@ -433,7 +433,8 @@ struct address_space {
        struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
        /* Protected by tree_lock together with the radix tree */
        unsigned long           nrpages;        /* number of total pages */
-       unsigned long           nrshadows;      /* number of shadow entries */
+       /* number of shadow or DAX exceptional entries */
+       unsigned long           nrexceptional;
        pgoff_t                 writeback_index;/* writeback starts here */
        const struct address_space_operations *a_ops;   /* methods */
        unsigned long           flags;          /* error bits/gfp mask */
@@ -714,6 +715,31 @@ enum inode_i_mutex_lock_class
        I_MUTEX_PARENT2,
 };
 
+static inline void inode_lock(struct inode *inode)
+{
+       mutex_lock(&inode->i_mutex);
+}
+
+static inline void inode_unlock(struct inode *inode)
+{
+       mutex_unlock(&inode->i_mutex);
+}
+
+static inline int inode_trylock(struct inode *inode)
+{
+       return mutex_trylock(&inode->i_mutex);
+}
+
+static inline int inode_is_locked(struct inode *inode)
+{
+       return mutex_is_locked(&inode->i_mutex);
+}
+
+static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
+{
+       mutex_lock_nested(&inode->i_mutex, subclass);
+}
+
 void lock_two_nondirectories(struct inode *, struct inode*);
 void unlock_two_nondirectories(struct inode *, struct inode*);
 
@@ -3047,8 +3073,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
 }
 static inline bool dir_relax(struct inode *inode)
 {
-       mutex_unlock(&inode->i_mutex);
-       mutex_lock(&inode->i_mutex);
+       inode_unlock(inode);
+       inode_lock(inode);
        return !IS_DEADDIR(inode);
 }
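
The xfs_pnfs.c hunk above is one instance of the tree-wide pattern these wrappers enable: encapsulating i_mutex so the underlying primitive can later be swapped without touching every filesystem. The conversion is mechanical, roughly:

    /* Hypothetical before/after of the mechanical conversion */
    static void work_under_inode_lock_sketch(struct inode *inode)
    {
            inode_lock(inode);      /* was: mutex_lock(&inode->i_mutex) */

            WARN_ON(!inode_is_locked(inode));  /* was: mutex_is_locked() */
            /* ... modify the inode ... */

            inode_unlock(inode);    /* was: mutex_unlock(&inode->i_mutex) */
    }
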
 
index cfe81e10bd5429baf5ed0ccf50bd4613ff6ee3c6..459fd25b378e73cfd2e911761ad845076be547de 100644 (file)
@@ -120,15 +120,15 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                    unsigned long start,
                                    unsigned long end,
                                    long adjust_next);
-extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-               spinlock_t **ptl);
+extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
+               struct vm_area_struct *vma);
 /* mmap_sem must be held on entry */
-static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-               spinlock_t **ptl)
+static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
+               struct vm_area_struct *vma)
 {
        VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
-               return __pmd_trans_huge_lock(pmd, vma, ptl);
+               return __pmd_trans_huge_lock(pmd, vma);
        else
-               return false;
+               return NULL;
 }
@@ -190,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         long adjust_next)
 {
 }
-static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-               spinlock_t **ptl)
+static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
+               struct vm_area_struct *vma)
 {
-       return false;
+       return NULL;
 }
 
 static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
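
Returning the ptl directly (NULL when the pmd is not huge) tightens every caller from an out-parameter dance to a simple test:

    /* Hypothetical caller conversion for the new interface */
    static void inspect_pmd_sketch(pmd_t *pmd, struct vm_area_struct *vma)
    {
            spinlock_t *ptl = pmd_trans_huge_lock(pmd, vma);

            if (ptl) {
                    /* *pmd is a stable huge (or devmap) pmd here */
                    spin_unlock(ptl);
                    return;
            }
            /* NULL: not huge, or THP compiled out; fall back to pte level */
    }
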
index 013fd9bc4cb6cefb102705a88a6911de546acd0a..083d61e9270632be41a7b707104af7003d3c23cb 100644 (file)
@@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id)
 #define idr_for_each_entry(idp, entry, id)                     \
        for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
 
+/**
+ * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type
+ * @idp:     idr handle
+ * @entry:   the type * to use as cursor
+ * @id:      id entry's key
+ *
+ * Continue to iterate over entries of the given type, resuming after
+ * the current position.
+ */
+#define idr_for_each_entry_continue(idp, entry, id)                    \
+       for ((entry) = idr_get_next((idp), &(id));                      \
+            entry;                                                     \
+            ++id, (entry) = idr_get_next((idp), &(id)))
+
 /*
  * IDA - IDR based id allocator, use when translation from id to
  * pointer isn't necessary.
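
A short sketch of the resume pattern the new macro enables (element type and helpers are invented):

    /* Hypothetical: restart an idr walk at a remembered position, e.g.
     * after dropping a lock between netlink dump callbacks. */
    static void resume_walk_sketch(struct idr *idr, int last_id)
    {
            struct thing *t;        /* invented element type */
            int id = last_id;

            idr_for_each_entry_continue(idr, t, id)
                    process_thing(t);       /* invented per-entry work */
    }
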
index cb30edbfe9fcd010b217aa385073d54fa71105c4..0e95fcc75b2ac141ceeb3ebb3b2f79d7a5d2ddad 100644 (file)
@@ -413,7 +413,7 @@ enum
        NET_TX_SOFTIRQ,
        NET_RX_SOFTIRQ,
        BLOCK_SOFTIRQ,
-       BLOCK_IOPOLL_SOFTIRQ,
+       IRQ_POLL_SOFTIRQ,
        TASKLET_SOFTIRQ,
        SCHED_SOFTIRQ,
        HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the
diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h
new file mode 100644 (file)
index 0000000..3e8c1b8
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef IRQ_POLL_H
+#define IRQ_POLL_H
+
+struct irq_poll;
+typedef int (irq_poll_fn)(struct irq_poll *, int);
+
+struct irq_poll {
+       struct list_head list;
+       unsigned long state;
+       int weight;
+       irq_poll_fn *poll;
+};
+
+enum {
+       IRQ_POLL_F_SCHED        = 0,
+       IRQ_POLL_F_DISABLE      = 1,
+};
+
+extern void irq_poll_sched(struct irq_poll *);
+extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *);
+extern void irq_poll_complete(struct irq_poll *);
+extern void irq_poll_enable(struct irq_poll *);
+extern void irq_poll_disable(struct irq_poll *);
+
+#endif
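
This is the old blk-iopoll machinery (deleted above) made subsystem-neutral; note the matching IRQ_POLL_SOFTIRQ rename earlier in this section. A hedged sketch of the driver-side contract; mydev, the budget handling, and the completion count are invented:

    /* Poll callback: consume up to @budget completions, then either stop
     * polling (irq_poll_complete) or stay scheduled by returning budget. */
    static int my_poll_sketch(struct irq_poll *iop, int budget)
    {
            int done = 0;   /* invented: count of completions processed */

            if (done < budget)
                    irq_poll_complete(iop); /* drained, rearm interrupts */

            return done;
    }

    /*
     * Setup and kick, typically:
     *      irq_poll_init(&mydev->iop, 32, my_poll_sketch);  (at probe)
     *      irq_poll_sched(&mydev->iop);                     (from hardirq)
     */
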
index 034117b3be5f7d91e5e869ea9e856f51964ebac6..d6750111e48ecc4603dbc5f28eb01f2810ed6303 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef NVM_H
 #define NVM_H
 
+#include <linux/types.h>
+
 enum {
        NVM_IO_OK = 0,
        NVM_IO_REQUEUE = 1,
@@ -11,12 +13,74 @@ enum {
        NVM_IOTYPE_GC = 1,
 };
 
+#define NVM_BLK_BITS (16)
+#define NVM_PG_BITS  (16)
+#define NVM_SEC_BITS (8)
+#define NVM_PL_BITS  (8)
+#define NVM_LUN_BITS (8)
+#define NVM_CH_BITS  (8)
+
+struct ppa_addr {
+       /* Generic structure for all addresses */
+       union {
+               struct {
+                       u64 blk         : NVM_BLK_BITS;
+                       u64 pg          : NVM_PG_BITS;
+                       u64 sec         : NVM_SEC_BITS;
+                       u64 pl          : NVM_PL_BITS;
+                       u64 lun         : NVM_LUN_BITS;
+                       u64 ch          : NVM_CH_BITS;
+               } g;
+
+               u64 ppa;
+       };
+};
+
+struct nvm_rq;
+struct nvm_id;
+struct nvm_dev;
+
+typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
+typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *);
+typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
+typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
+                               nvm_l2p_update_fn *, void *);
+typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int,
+                               nvm_bb_update_fn *, void *);
+typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int);
+typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
+typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
+typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
+typedef void (nvm_destroy_dma_pool_fn)(void *);
+typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
+                                                               dma_addr_t *);
+typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t);
+
+struct nvm_dev_ops {
+       nvm_id_fn               *identity;
+       nvm_get_l2p_tbl_fn      *get_l2p_tbl;
+       nvm_op_bb_tbl_fn        *get_bb_tbl;
+       nvm_op_set_bb_fn        *set_bb_tbl;
+
+       nvm_submit_io_fn        *submit_io;
+       nvm_erase_blk_fn        *erase_block;
+
+       nvm_create_dma_pool_fn  *create_dma_pool;
+       nvm_destroy_dma_pool_fn *destroy_dma_pool;
+       nvm_dev_dma_alloc_fn    *dev_dma_alloc;
+       nvm_dev_dma_free_fn     *dev_dma_free;
+
+       unsigned int            max_phys_sect;
+};
+
+
+
 #ifdef CONFIG_NVM
 
 #include <linux/blkdev.h>
-#include <linux/types.h>
 #include <linux/file.h>
 #include <linux/dmapool.h>
+#include <uapi/linux/lightnvm.h>
 
 enum {
        /* HW Responsibilities */
@@ -58,8 +122,29 @@ enum {
        /* Block Types */
        NVM_BLK_T_FREE          = 0x0,
        NVM_BLK_T_BAD           = 0x1,
-       NVM_BLK_T_DEV           = 0x2,
-       NVM_BLK_T_HOST          = 0x4,
+       NVM_BLK_T_GRWN_BAD      = 0x2,
+       NVM_BLK_T_DEV           = 0x4,
+       NVM_BLK_T_HOST          = 0x8,
+
+       /* Memory capabilities */
+       NVM_ID_CAP_SLC          = 0x1,
+       NVM_ID_CAP_CMD_SUSPEND  = 0x2,
+       NVM_ID_CAP_SCRAMBLE     = 0x4,
+       NVM_ID_CAP_ENCRYPT      = 0x8,
+
+       /* Memory types */
+       NVM_ID_FMTYPE_SLC       = 0,
+       NVM_ID_FMTYPE_MLC       = 1,
+};
+
+struct nvm_id_lp_mlc {
+       u16     num_pairs;
+       u8      pairs[886];
+};
+
+struct nvm_id_lp_tbl {
+       __u8    id[8];
+       struct nvm_id_lp_mlc mlc;
 };
 
 struct nvm_id_group {
@@ -82,6 +167,8 @@ struct nvm_id_group {
        u32     mpos;
        u32     mccap;
        u16     cpar;
+
+       struct nvm_id_lp_tbl lptbl;
 };
 
 struct nvm_addr_format {
@@ -125,28 +212,8 @@ struct nvm_tgt_instance {
 #define NVM_VERSION_MINOR 0
 #define NVM_VERSION_PATCH 0
 
-#define NVM_BLK_BITS (16)
-#define NVM_PG_BITS  (16)
-#define NVM_SEC_BITS (8)
-#define NVM_PL_BITS  (8)
-#define NVM_LUN_BITS (8)
-#define NVM_CH_BITS  (8)
-
-struct ppa_addr {
-       /* Generic structure for all addresses */
-       union {
-               struct {
-                       u64 blk         : NVM_BLK_BITS;
-                       u64 pg          : NVM_PG_BITS;
-                       u64 sec         : NVM_SEC_BITS;
-                       u64 pl          : NVM_PL_BITS;
-                       u64 lun         : NVM_LUN_BITS;
-                       u64 ch          : NVM_CH_BITS;
-               } g;
-
-               u64 ppa;
-       };
-};
+struct nvm_rq;
+typedef void (nvm_end_io_fn)(struct nvm_rq *);
 
 struct nvm_rq {
        struct nvm_tgt_instance *ins;
@@ -164,9 +231,14 @@ struct nvm_rq {
        void *metadata;
        dma_addr_t dma_metadata;
 
+       struct completion *wait;
+       nvm_end_io_fn *end_io;
+
        uint8_t opcode;
        uint16_t nr_pages;
        uint16_t flags;
+
+       int error;
 };
 
 static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu)
@@ -181,51 +253,31 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata)
 
 struct nvm_block;
 
-typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
-typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *);
-typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
-typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
-                               nvm_l2p_update_fn *, void *);
-typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int,
-                               nvm_bb_update_fn *, void *);
-typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int);
-typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
-typedef void (nvm_destroy_dma_pool_fn)(void *);
-typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
-                                                               dma_addr_t *);
-typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t);
-
-struct nvm_dev_ops {
-       nvm_id_fn               *identity;
-       nvm_get_l2p_tbl_fn      *get_l2p_tbl;
-       nvm_op_bb_tbl_fn        *get_bb_tbl;
-       nvm_op_set_bb_fn        *set_bb_tbl;
-
-       nvm_submit_io_fn        *submit_io;
-       nvm_erase_blk_fn        *erase_block;
-
-       nvm_create_dma_pool_fn  *create_dma_pool;
-       nvm_destroy_dma_pool_fn *destroy_dma_pool;
-       nvm_dev_dma_alloc_fn    *dev_dma_alloc;
-       nvm_dev_dma_free_fn     *dev_dma_free;
-
-       unsigned int            max_phys_sect;
-};
-
 struct nvm_lun {
        int id;
 
        int lun_id;
        int chnl_id;
 
-       unsigned int nr_inuse_blocks;   /* Number of used blocks */
+       /* It is up to the target to mark blocks as closed. If the target does
+        * not do it, all blocks are marked as open, and nr_open_blocks
+        * represents the number of blocks in use
+        */
+       unsigned int nr_open_blocks;    /* Number of used, writable blocks */
+       unsigned int nr_closed_blocks;  /* Number of used, read-only blocks */
        unsigned int nr_free_blocks;    /* Number of unused blocks */
        unsigned int nr_bad_blocks;     /* Number of bad blocks */
-       struct nvm_block *blocks;
 
        spinlock_t lock;
+
+       struct nvm_block *blocks;
+};
+
+enum {
+       NVM_BLK_ST_FREE =       0x1,    /* Free block */
+       NVM_BLK_ST_OPEN =       0x2,    /* Open block - read-write */
+       NVM_BLK_ST_CLOSED =     0x4,    /* Closed block - read-only */
+       NVM_BLK_ST_BAD =        0x8,    /* Bad block */
 };
 
 struct nvm_block {
@@ -234,7 +286,16 @@ struct nvm_block {
        unsigned long id;
 
        void *priv;
-       int type;
+       int state;
+};
+
+/* system block cpu representation */
+struct nvm_sb_info {
+       unsigned long           seqnr;
+       unsigned long           erase_cnt;
+       unsigned int            version;
+       char                    mmtype[NVM_MMTYPE_LEN];
+       struct ppa_addr         fs_ppa;
 };
 
 struct nvm_dev {
@@ -247,6 +308,9 @@ struct nvm_dev {
        struct nvmm_type *mt;
        void *mp;
 
+       /* System blocks */
+       struct nvm_sb_info sb;
+
        /* Device information */
        int nr_chnls;
        int nr_planes;
@@ -256,6 +320,7 @@ struct nvm_dev {
        int blks_per_lun;
        int sec_size;
        int oob_size;
+       int mccap;
        struct nvm_addr_format ppaf;
 
        /* Calculated/Cached values. These do not reflect the actual usable
@@ -268,6 +333,10 @@ struct nvm_dev {
        int sec_per_blk;
        int sec_per_lun;
 
+       /* lower page table */
+       int lps_per_blk;
+       int *lptbl;
+
        unsigned long total_pages;
        unsigned long total_blocks;
        int nr_luns;
@@ -280,6 +349,8 @@ struct nvm_dev {
        /* Backend device */
        struct request_queue *q;
        char name[DISK_NAME_LEN];
+
+       struct mutex mlock;
 };
 
 static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
@@ -345,9 +416,13 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev,
        return ppa;
 }
 
+static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg)
+{
+       return dev->lptbl[slc_pg];
+}
+
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef int (nvm_tgt_end_io_fn)(struct nvm_rq *, int);
 typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int);
 typedef void (nvm_tgt_exit_fn)(void *);
 
@@ -358,7 +433,7 @@ struct nvm_tgt_type {
        /* target entry points */
        nvm_tgt_make_rq_fn *make_rq;
        nvm_tgt_capacity_fn *capacity;
-       nvm_tgt_end_io_fn *end_io;
+       nvm_end_io_fn *end_io;
 
        /* module-specific init/teardown */
        nvm_tgt_init_fn *init;
@@ -383,7 +458,6 @@ typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvmm_end_io_fn)(struct nvm_rq *, int);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *,
                                                                unsigned long);
 typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
@@ -397,6 +471,8 @@ struct nvmm_type {
        nvmm_unregister_fn *unregister_mgr;
 
        /* Block administration callbacks */
+       nvmm_get_blk_fn *get_blk_unlocked;
+       nvmm_put_blk_fn *put_blk_unlocked;
        nvmm_get_blk_fn *get_blk;
        nvmm_put_blk_fn *put_blk;
        nvmm_open_blk_fn *open_blk;
@@ -404,7 +480,6 @@ struct nvmm_type {
        nvmm_flush_blk_fn *flush_blk;
 
        nvmm_submit_io_fn *submit_io;
-       nvmm_end_io_fn *end_io;
        nvmm_erase_blk_fn *erase_blk;
 
        /* Configuration management */
@@ -418,6 +493,10 @@ struct nvmm_type {
 extern int nvm_register_mgr(struct nvmm_type *);
 extern void nvm_unregister_mgr(struct nvmm_type *);
 
+extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *,
+                                       struct nvm_lun *, unsigned long);
+extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *);
+
 extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *,
                                                                unsigned long);
 extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *);
@@ -427,7 +506,36 @@ extern int nvm_register(struct request_queue *, char *,
 extern void nvm_unregister(char *);
 
 extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
+extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
+extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
+extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
+                                                       struct ppa_addr *, int);
+extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
+extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int);
 extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
+extern void nvm_end_io(struct nvm_rq *, int);
+extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
+                                                               void *, int);
+
+/* sysblk.c */
+#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */
+
+/* system block on disk representation */
+struct nvm_system_block {
+       __be32                  magic;          /* magic signature */
+       __be32                  seqnr;          /* sequence number */
+       __be32                  erase_cnt;      /* erase count */
+       __be16                  version;        /* version number */
+       u8                      mmtype[NVM_MMTYPE_LEN]; /* media manager name */
+       __be64                  fs_ppa;         /* PPA for media manager
+                                                * superblock */
+};
+
+extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *);
+extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *);
+extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *);
+
+extern int nvm_dev_factory(struct nvm_dev *, int flags);
 #else /* CONFIG_NVM */
 struct nvm_dev_ops;
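
With ppa_addr and nvm_dev_ops hoisted out of the CONFIG_NVM guard, drivers can reference them unconditionally. The union gives two views of one 64-bit address; a small sketch:

    /* Hypothetical: build a generic-layout address field by field, then
     * hand the packed u64 form to the device. */
    static u64 make_ppa_sketch(int ch, int lun, int blk, int pg)
    {
            struct ppa_addr p;

            p.ppa = 0;      /* clear all bitfields at once via the union */
            p.g.ch = ch;
            p.g.lun = lun;
            p.g.blk = blk;
            p.g.pg = pg;

            return p.ppa;
    }
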
 
index 46262284de478196e4e907674de9e5e04f364d60..04fc6e6c7ff0a48c6d3a9f39e5fd090f8479e449 100644 (file)
@@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
 extern void lc_committed(struct lru_cache *lc);
 
 struct seq_file;
-extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
+extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
 
 extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
                                void (*detail) (struct seq_file *, struct lc_element *));
index 58391f2e0414e0e2001fab4c002771798fad3d33..116b284bc4ce81b77c5cd6f14266eb62c7ad4c03 100644 (file)
@@ -206,7 +206,8 @@ enum {
        MLX4_SET_PORT_GID_TABLE = 0x5,
        MLX4_SET_PORT_PRIO2TC   = 0x8,
        MLX4_SET_PORT_SCHEDULER = 0x9,
-       MLX4_SET_PORT_VXLAN     = 0xB
+       MLX4_SET_PORT_VXLAN     = 0xB,
+       MLX4_SET_PORT_ROCE_ADDR = 0xD
 };
 
 enum {
index d3133be12d922521e24528480edbdb31659a7007..430a929f048b3d2f27d1e842f45e80f9e9e6347a 100644 (file)
@@ -216,6 +216,7 @@ enum {
        MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN      = 1LL <<  30,
        MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31,
        MLX4_DEV_CAP_FLAG2_LB_SRC_CHK           = 1ULL << 32,
+       MLX4_DEV_CAP_FLAG2_ROCE_V1_V2           = 1ULL <<  33,
 };
 
 enum {
@@ -267,12 +268,14 @@ enum {
        MLX4_BMME_FLAG_TYPE_2_WIN       = 1 <<  9,
        MLX4_BMME_FLAG_RESERVED_LKEY    = 1 << 10,
        MLX4_BMME_FLAG_FAST_REG_WR      = 1 << 11,
+       MLX4_BMME_FLAG_ROCE_V1_V2       = 1 << 19,
        MLX4_BMME_FLAG_PORT_REMAP       = 1 << 24,
        MLX4_BMME_FLAG_VSD_INIT2RTR     = 1 << 28,
 };
 
 enum {
-       MLX4_FLAG_PORT_REMAP            = MLX4_BMME_FLAG_PORT_REMAP
+       MLX4_FLAG_PORT_REMAP            = MLX4_BMME_FLAG_PORT_REMAP,
+       MLX4_FLAG_ROCE_V1_V2            = MLX4_BMME_FLAG_ROCE_V1_V2
 };
 
 enum mlx4_event {
@@ -979,14 +982,11 @@ struct mlx4_mad_ifc {
        for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)     \
                if ((type) == (dev)->caps.port_mask[(port)])
 
-#define mlx4_foreach_non_ib_transport_port(port, dev)                     \
-       for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)       \
-               if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB))
-
 #define mlx4_foreach_ib_transport_port(port, dev)                         \
-       for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)       \
+       for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)       \
                if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
-                       ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
+                       ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \
+                       ((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2))
 
 #define MLX4_INVALID_SLAVE_ID  0xFF
 #define MLX4_SINK_COUNTER_INDEX(dev)   (dev->caps.max_counters - 1)
@@ -1457,6 +1457,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port);
 
 int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port);
 int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis);
+int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port);
 int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2);
 int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port);
 int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port);
index fe052e234906da97e88206d0b05db2442bced61f..587cdf943b524abd88fa945844b1e5b209b7fe2e 100644 (file)
@@ -194,7 +194,7 @@ struct mlx4_qp_context {
        u8                      mtu_msgmax;
        u8                      rq_size_stride;
        u8                      sq_size_stride;
-       u8                      rlkey;
+       u8                      rlkey_roce_mode;
        __be32                  usr_page;
        __be32                  local_qpn;
        __be32                  remote_qpn;
@@ -204,7 +204,8 @@ struct mlx4_qp_context {
        u32                     reserved1;
        __be32                  next_send_psn;
        __be32                  cqn_send;
-       u32                     reserved2[2];
+       __be16                  roce_entropy;
+       __be16                  reserved2[3];
        __be32                  last_acked_psn;
        __be32                  ssn;
        __be32                  params2;
@@ -487,4 +488,14 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
 
 void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
 
+static inline u16 folded_qp(u32 q)
+{
+       u16 res;
+
+       res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00);
+       return res;
+}
+
+u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn);
+
 #endif /* MLX4_QP_H */
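
folded_qp() squeezes a 24-bit QPN into 16 bits: the top byte is XOR-ed into the bottom byte and the middle byte is kept. A worked example:

    /*
     *      q                                   = 0x123456
     *      (q & 0xff) ^ ((q & 0xff0000) >> 16) = 0x56 ^ 0x12 = 0x44
     *      q & 0xff00                          = 0x3400
     *      folded_qp(q)                        = 0x3444
     *
     * mlx4_qp_roce_entropy() presumably feeds this into the new
     * roce_entropy field of mlx4_qp_context above, giving RoCE v2
     * flows per-QP entropy.
     */
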
index 7be845e306897ccd9d598ace7341976733164e5a..987764afa65c2344d5a85328f9f35a419e21db64 100644 (file)
@@ -223,6 +223,14 @@ enum {
 #define MLX5_UMR_MTT_MASK      (MLX5_UMR_MTT_ALIGNMENT - 1)
 #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT
 
+#define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8)
+
+enum {
+       MLX5_EVENT_QUEUE_TYPE_QP = 0,
+       MLX5_EVENT_QUEUE_TYPE_RQ = 1,
+       MLX5_EVENT_QUEUE_TYPE_SQ = 2,
+};
+
 enum mlx5_event {
        MLX5_EVENT_TYPE_COMP               = 0x0,
 
@@ -279,6 +287,26 @@ enum {
        MLX5_DEV_CAP_FLAG_CMDIF_CSUM    = 3LL << 46,
 };
 
+enum {
+       MLX5_ROCE_VERSION_1             = 0,
+       MLX5_ROCE_VERSION_2             = 2,
+};
+
+enum {
+       MLX5_ROCE_VERSION_1_CAP         = 1 << MLX5_ROCE_VERSION_1,
+       MLX5_ROCE_VERSION_2_CAP         = 1 << MLX5_ROCE_VERSION_2,
+};
+
+enum {
+       MLX5_ROCE_L3_TYPE_IPV4          = 0,
+       MLX5_ROCE_L3_TYPE_IPV6          = 1,
+};
+
+enum {
+       MLX5_ROCE_L3_TYPE_IPV4_CAP      = 1 << 1,
+       MLX5_ROCE_L3_TYPE_IPV6_CAP      = 1 << 2,
+};
+
 enum {
        MLX5_OPCODE_NOP                 = 0x00,
        MLX5_OPCODE_SEND_INVAL          = 0x01,
@@ -446,7 +474,7 @@ struct mlx5_init_seg {
        __be32                  rsvd2[880];
        __be32                  internal_timer_h;
        __be32                  internal_timer_l;
-       __be32                  rsrv3[2];
+       __be32                  rsvd3[2];
        __be32                  health_counter;
        __be32                  rsvd4[1019];
        __be64                  ieee1588_clk;
@@ -460,7 +488,9 @@ struct mlx5_eqe_comp {
 };
 
 struct mlx5_eqe_qp_srq {
-       __be32  reserved[6];
+       __be32  reserved1[5];
+       u8      type;
+       u8      reserved2[3];
        __be32  qp_srq_n;
 };
 
@@ -650,6 +680,12 @@ enum {
        CQE_RSS_HTYPE_L4        = 0x3 << 2,
 };
 
+enum {
+       MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH        = 0x0,
+       MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6       = 0x1,
+       MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4       = 0x2,
+};
+
 enum {
        CQE_L2_OK       = 1 << 0,
        CQE_L3_OK       = 1 << 1,
index 5162f3533042825d03ea5c6244d1735bd55a8663..1e3006dcf35d735175ebe054d8867f89b994231c 100644 (file)
@@ -115,6 +115,11 @@ enum {
        MLX5_REG_HOST_ENDIANNESS = 0x7004,
 };
 
+enum {
+       MLX5_ATOMIC_OPS_CMP_SWAP        = 1 << 0,
+       MLX5_ATOMIC_OPS_FETCH_ADD       = 1 << 1,
+};
+
 enum mlx5_page_fault_resume_flags {
        MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0,
        MLX5_PAGE_FAULT_RESUME_WRITE     = 1 << 1,
@@ -341,9 +346,11 @@ struct mlx5_core_mr {
 };
 
 enum mlx5_res_type {
-       MLX5_RES_QP,
-       MLX5_RES_SRQ,
-       MLX5_RES_XSRQ,
+       MLX5_RES_QP     = MLX5_EVENT_QUEUE_TYPE_QP,
+       MLX5_RES_RQ     = MLX5_EVENT_QUEUE_TYPE_RQ,
+       MLX5_RES_SQ     = MLX5_EVENT_QUEUE_TYPE_SQ,
+       MLX5_RES_SRQ    = 3,
+       MLX5_RES_XSRQ   = 4,
 };
 
 struct mlx5_core_rsc_common {
@@ -651,13 +658,6 @@ extern struct workqueue_struct *mlx5_core_wq;
        .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
        .struct_size_bytes   = sizeof((struct ib_unpacked_ ## header *)0)->field
 
-struct ib_field {
-       size_t struct_offset_bytes;
-       size_t struct_size_bytes;
-       int    offset_bits;
-       int    size_bits;
-};
-
 static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev)
 {
        return pci_get_drvdata(pdev);
index 68d73f82e009e60654952f237c0f9fed01bd67e5..231ab6bcea76356ecf6539513fc210fe1e208451 100644 (file)
@@ -66,6 +66,11 @@ enum {
        MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN   = 0x3
 };
 
+enum {
+       MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE        = 0x0,
+       MLX5_SET_HCA_CAP_OP_MOD_ATOMIC                = 0x3,
+};
+
 enum {
        MLX5_CMD_OP_QUERY_HCA_CAP                 = 0x100,
        MLX5_CMD_OP_QUERY_ADAPTER                 = 0x101,
@@ -573,21 +578,24 @@ enum {
 struct mlx5_ifc_atomic_caps_bits {
        u8         reserved_0[0x40];
 
-       u8         atomic_req_endianness[0x1];
-       u8         reserved_1[0x1f];
+       u8         atomic_req_8B_endianess_mode[0x2];
+       u8         reserved_1[0x4];
+       u8         supported_atomic_req_8B_endianess_mode_1[0x1];
 
-       u8         reserved_2[0x20];
+       u8         reserved_2[0x19];
 
-       u8         reserved_3[0x10];
-       u8         atomic_operations[0x10];
+       u8         reserved_3[0x20];
 
        u8         reserved_4[0x10];
-       u8         atomic_size_qp[0x10];
+       u8         atomic_operations[0x10];
 
        u8         reserved_5[0x10];
+       u8         atomic_size_qp[0x10];
+
+       u8         reserved_6[0x10];
        u8         atomic_size_dc[0x10];
 
-       u8         reserved_6[0x720];
+       u8         reserved_7[0x720];
 };
 
 struct mlx5_ifc_odp_cap_bits {
@@ -850,7 +858,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
        u8         reserved_66[0x8];
        u8         log_uar_page_sz[0x10];
 
-       u8         reserved_67[0x40];
+       u8         reserved_67[0x20];
+       u8         device_frequency_mhz[0x20];
        u8         device_frequency_khz[0x20];
        u8         reserved_68[0x5f];
        u8         cqe_zip[0x1];
@@ -2215,19 +2224,25 @@ struct mlx5_ifc_nic_vport_context_bits {
 
        u8         mtu[0x10];
 
-       u8         reserved_3[0x640];
+       u8         system_image_guid[0x40];
+       u8         port_guid[0x40];
+       u8         node_guid[0x40];
+
+       u8         reserved_3[0x140];
+       u8         qkey_violation_counter[0x10];
+       u8         reserved_4[0x430];
 
        u8         promisc_uc[0x1];
        u8         promisc_mc[0x1];
        u8         promisc_all[0x1];
-       u8         reserved_4[0x2];
+       u8         reserved_5[0x2];
        u8         allowed_list_type[0x3];
-       u8         reserved_5[0xc];
+       u8         reserved_6[0xc];
        u8         allowed_list_size[0xc];
 
        struct mlx5_ifc_mac_address_layout_bits permanent_address;
 
-       u8         reserved_6[0x20];
+       u8         reserved_7[0x20];
 
        u8         current_uc_mac_address[0][0x40];
 };
@@ -4199,6 +4214,13 @@ struct mlx5_ifc_modify_tis_out_bits {
        u8         reserved_1[0x40];
 };
 
+struct mlx5_ifc_modify_tis_bitmask_bits {
+       u8         reserved_0[0x20];
+
+       u8         reserved_1[0x1f];
+       u8         prio[0x1];
+};
+
 struct mlx5_ifc_modify_tis_in_bits {
        u8         opcode[0x10];
        u8         reserved_0[0x10];
@@ -4211,7 +4233,7 @@ struct mlx5_ifc_modify_tis_in_bits {
 
        u8         reserved_3[0x20];
 
-       u8         modify_bitmask[0x40];
+       struct mlx5_ifc_modify_tis_bitmask_bits bitmask;
 
        u8         reserved_4[0x40];
 
index f079fb1a31f7f7953f0bb28f6f3a145279598dc7..5b8c89ffaa5830fdf05fbc40a1fce0b5d2ffa7f3 100644 (file)
@@ -85,7 +85,16 @@ enum mlx5_qp_state {
        MLX5_QP_STATE_ERR                       = 6,
        MLX5_QP_STATE_SQ_DRAINING               = 7,
        MLX5_QP_STATE_SUSPENDED                 = 9,
-       MLX5_QP_NUM_STATE
+       MLX5_QP_NUM_STATE,
+       MLX5_QP_STATE,
+       MLX5_QP_STATE_BAD,
+};
+
+enum {
+       MLX5_SQ_STATE_NA        = MLX5_SQC_STATE_ERR + 1,
+       MLX5_SQ_NUM_STATE       = MLX5_SQ_STATE_NA + 1,
+       MLX5_RQ_STATE_NA        = MLX5_RQC_STATE_ERR + 1,
+       MLX5_RQ_NUM_STATE       = MLX5_RQ_STATE_NA + 1,
 };
 
 enum {
@@ -130,6 +139,9 @@ enum {
        MLX5_QP_BIT_RWE                         = 1 << 14,
        MLX5_QP_BIT_RAE                         = 1 << 13,
        MLX5_QP_BIT_RIC                         = 1 <<  4,
+       MLX5_QP_BIT_CC_SLAVE_RECV               = 1 <<  2,
+       MLX5_QP_BIT_CC_SLAVE_SEND               = 1 <<  1,
+       MLX5_QP_BIT_CC_MASTER                   = 1 <<  0
 };
 
 enum {
@@ -248,8 +260,12 @@ struct mlx5_av {
        __be32  dqp_dct;
        u8      stat_rate_sl;
        u8      fl_mlid;
-       __be16  rlid;
-       u8      reserved0[10];
+       union {
+               __be16  rlid;
+               __be16  udp_sport;
+       };
+       u8      reserved0[4];
+       u8      rmac[6];
        u8      tclass;
        u8      hop_limit;
        __be32  grh_gid_fl;
@@ -456,11 +472,16 @@ struct mlx5_qp_path {
        u8                      static_rate;
        u8                      hop_limit;
        __be32                  tclass_flowlabel;
-       u8                      rgid[16];
-       u8                      rsvd1[4];
-       u8                      sl;
+       union {
+               u8              rgid[16];
+               u8              rip[16];
+       };
+       u8                      f_dscp_ecn_prio;
+       u8                      ecn_dscp;
+       __be16                  udp_sport;
+       u8                      dci_cfi_prio_sl;
        u8                      port;
-       u8                      rsvd2[6];
+       u8                      rmac[6];
 };
 
 struct mlx5_qp_context {
@@ -620,8 +641,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
                        struct mlx5_core_qp *qp,
                        struct mlx5_create_qp_mbox_in *in,
                        int inlen);
-int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
-                       enum mlx5_qp_state new_state,
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation,
                        struct mlx5_modify_qp_mbox_in *in, int sqd_event,
                        struct mlx5_core_qp *qp);
 int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
@@ -639,6 +659,14 @@ void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
 int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
                                u8 context, int error);
 #endif
+int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                               struct mlx5_core_qp *rq);
+void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
+                                 struct mlx5_core_qp *rq);
+int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                               struct mlx5_core_qp *sq);
+void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
+                                 struct mlx5_core_qp *sq);
 
 static inline const char *mlx5_qp_type_str(int type)
 {
diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h
new file mode 100644 (file)
index 0000000..88441f5
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __TRANSOBJ_H__
+#define __TRANSOBJ_H__
+
+#include <linux/mlx5/driver.h>
+
+int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn);
+void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn);
+int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                       u32 *rqn);
+int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen);
+void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn);
+int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out);
+int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                       u32 *sqn);
+int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen);
+void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn);
+int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out);
+int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                        u32 *tirn);
+int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
+                        int inlen);
+void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
+int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                        u32 *tisn);
+int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in,
+                        int inlen);
+void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn);
+int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                        u32 *rmpn);
+int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen);
+int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn);
+int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out);
+int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
+int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                         u32 *rmpn);
+int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 rmpn);
+int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u32 *out);
+int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
+
+int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
+                        u32 *rqtn);
+int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
+                        int inlen);
+void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn);
+
+#endif /* __TRANSOBJ_H__ */
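
The new transport-object API wraps the firmware commands for RQ/SQ/TIR/TIS/RMP/RQT objects. A hedged lifecycle sketch using only the calls declared above; the command buffers follow the mlx5_ifc layouts and are left opaque here:

    /* Hypothetical: create an RQ, move it to its next state, tear down. */
    static int rq_lifecycle_sketch(struct mlx5_core_dev *dev,
                                   u32 *create_in, int create_inlen,
                                   u32 *modify_in, int modify_inlen)
    {
            u32 rqn;
            int err;

            err = mlx5_core_create_rq(dev, create_in, create_inlen, &rqn);
            if (err)
                    return err;

            err = mlx5_core_modify_rq(dev, rqn, modify_in, modify_inlen);

            mlx5_core_destroy_rq(dev, rqn);  /* sketch only: always tear down */
            return err;
    }
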
index 638f2ca7a527c01f63fe2c3b3cd8013fbaf70c86..123771003e68586571690f9a5211e5afaa0143e3 100644 (file)
@@ -45,6 +45,11 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
                                     u16 vport, u8 *addr);
 int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev,
                                      u16 vport, u8 *addr);
+int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
+                                          u64 *system_image_guid);
+int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
+int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
+                                       u16 *qkey_viol_cntr);
 int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport,
                             u8 port_num, u16  vf_num, u16 gid_index,
                             union ib_gid *gid);
@@ -85,4 +90,7 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev,
                                u16 vlans[],
                                int list_size);
 
+int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev);
+int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev);
+
 #endif /* __MLX5_VPORT_H__ */
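A hedged sketch of how a consumer might bracket its RoCE usage with the new vport helpers; the probe/remove framing is an assumption for illustration:

static int example_roce_setup(struct mlx5_core_dev *mdev)
{
	/* enable RoCE on the NIC vport before creating RoCE resources */
	return mlx5_nic_vport_enable_roce(mdev);
}

static void example_roce_teardown(struct mlx5_core_dev *mdev)
{
	/* tear down RoCE QPs and GID entries first, then disable */
	mlx5_nic_vport_disable_roce(mdev);
}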
index 3af5f454c04ac17f3e45d2578519d9fe2f919f47..a55986f6fe38e087e9996415f6469db64d5a3238 100644 (file)
 
 #include <linux/types.h>
 
-struct nvme_bar {
-       __u64                   cap;    /* Controller Capabilities */
-       __u32                   vs;     /* Version */
-       __u32                   intms;  /* Interrupt Mask Set */
-       __u32                   intmc;  /* Interrupt Mask Clear */
-       __u32                   cc;     /* Controller Configuration */
-       __u32                   rsvd1;  /* Reserved */
-       __u32                   csts;   /* Controller Status */
-       __u32                   nssr;   /* Subsystem Reset */
-       __u32                   aqa;    /* Admin Queue Attributes */
-       __u64                   asq;    /* Admin SQ Base Address */
-       __u64                   acq;    /* Admin CQ Base Address */
-       __u32                   cmbloc; /* Controller Memory Buffer Location */
-       __u32                   cmbsz;  /* Controller Memory Buffer Size */
+enum {
+       NVME_REG_CAP    = 0x0000,       /* Controller Capabilities */
+       NVME_REG_VS     = 0x0008,       /* Version */
+       NVME_REG_INTMS  = 0x000c,       /* Interrupt Mask Set */
+       NVME_REG_INTMC  = 0x0010,       /* Interrupt Mask Clear */
+       NVME_REG_CC     = 0x0014,       /* Controller Configuration */
+       NVME_REG_CSTS   = 0x001c,       /* Controller Status */
+       NVME_REG_NSSR   = 0x0020,       /* NVM Subsystem Reset */
+       NVME_REG_AQA    = 0x0024,       /* Admin Queue Attributes */
+       NVME_REG_ASQ    = 0x0028,       /* Admin SQ Base Address */
+       NVME_REG_ACQ    = 0x0030,       /* Admin CQ Base Address */
+       NVME_REG_CMBLOC = 0x0038,       /* Controller Memory Buffer Location */
+       NVME_REG_CMBSZ  = 0x003c,       /* Controller Memory Buffer Size */
 };
 
 #define NVME_CAP_MQES(cap)     ((cap) & 0xffff)
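With the BAR described as byte offsets instead of a struct, register access becomes an explicit MMIO read at bar + offset. A hedged sketch (the void __iomem *bar pointer and the use of lo_hi_readq() from <linux/io-64-nonatomic-lo-hi.h> are assumptions):

static u32 example_read_csts(void __iomem *bar)
{
	return readl(bar + NVME_REG_CSTS);	/* 32-bit Controller Status */
}

static u64 example_read_cap(void __iomem *bar)
{
	return lo_hi_readq(bar + NVME_REG_CAP);	/* 64-bit Capabilities */
}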
index 4d08b6c33557250edda8e949ff64f5f2fd1ffdc1..92395a0a7dc5c436cdd43317a798b6d15eba5a5e 100644 (file)
@@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
                               unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
                        int tag, unsigned int nr_pages, struct page **pages);
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+                       int tag, unsigned int nr_entries,
+                       struct page **entries, pgoff_t *indices);
 
 struct page *grab_cache_page_write_begin(struct address_space *mapping,
                        pgoff_t index, unsigned flags);
index eb8b8ac6df3c844e2bd84903e0a50ff07f1575fe..24f5470d394460c779ebdbd385d8f14766f14001 100644 (file)
@@ -42,6 +42,7 @@ struct pipe_buffer {
  *     @fasync_readers: reader side fasync
  *     @fasync_writers: writer side fasync
  *     @bufs: the circular array of pipe buffers
+ *     @user: the user who created this pipe
  **/
 struct pipe_inode_info {
        struct mutex mutex;
@@ -57,6 +58,7 @@ struct pipe_inode_info {
        struct fasync_struct *fasync_readers;
        struct fasync_struct *fasync_writers;
        struct pipe_buffer *bufs;
+       struct user_struct *user;
 };
 
 /*
@@ -123,6 +125,8 @@ void pipe_unlock(struct pipe_inode_info *);
 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 
 extern unsigned int pipe_max_size, pipe_min_size;
+extern unsigned long pipe_user_pages_hard;
+extern unsigned long pipe_user_pages_soft;
 int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 
 
index acfea8ce4a07be8b6291c76ca0f53728b06ae5fa..7c3d11a6b4ad5d8d35b68a33f87801865adada8d 100644 (file)
@@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 {
        BUG();
 }
+
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
+{
+       BUG();
+}
 #endif
 
 /*
  * Architectures that define ARCH_HAS_PMEM_API must provide
  * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
- * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
+ * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem()
+ * and arch_has_wmb_pmem().
  */
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
 {
@@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size)
        else
                default_clear_pmem(addr, size);
 }
+
+/**
+ * wb_cache_pmem - write back processor cache for PMEM memory range
+ * @addr:      virtual start address
+ * @size:      number of bytes to write back
+ *
+ * Write back the processor cache range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline void wb_cache_pmem(void __pmem *addr, size_t size)
+{
+       if (arch_has_pmem_api())
+               arch_wb_cache_pmem(addr, size);
+}
 #endif /* __PMEM_H__ */
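The ordering contract in the kernel-doc above — wb_cache_pmem() starts the write-back, a following wmb_pmem() fences it — can be sketched as below; the calling function is hypothetical:

static void example_make_durable(void __pmem *addr, size_t size)
{
	wb_cache_pmem(addr, size);	/* flush the dirtied range */
	wmb_pmem();			/* order the flush before later stores */
}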
index 57e7d87d2d4c15de691d8fa3075f779ddbda5172..7c88ad156a293c0bedcea771b58f3113909b6532 100644 (file)
 #define RADIX_TREE_EXCEPTIONAL_ENTRY   2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT   2
 
+#define RADIX_DAX_MASK 0xf
+#define RADIX_DAX_SHIFT        4
+#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) ((unsigned long)entry >> RADIX_DAX_SHIFT)
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+               RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
        return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
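The RADIX_DAX_* macros pack a sector number and a PTE-vs-PMD type bit into a radix-tree exceptional entry; the low RADIX_TREE_EXCEPTIONAL_ENTRY bit keeps the tree code from mistaking it for a page pointer. A hedged round-trip sketch:

static void example_radix_dax_roundtrip(sector_t sector)
{
	void *entry = RADIX_DAX_ENTRY(sector, false);	/* PTE-sized entry */

	BUG_ON(RADIX_DAX_TYPE(entry) != RADIX_DAX_PTE);
	BUG_ON(RADIX_DAX_SECTOR(entry) != sector);
}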
index f1e81e128592cbcf5b3c3e61949bcbe75b34c74f..a10494a94cc30f24d65ab866771833883c6f886d 100644 (file)
@@ -835,6 +835,7 @@ struct user_struct {
 #endif
        unsigned long locked_shm; /* How many pages of mlocked shm ? */
        unsigned long unix_inflight;    /* How many files in flight in unix sockets */
+       atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
 
 #ifdef CONFIG_KEYS
        struct key *uid_keyring;        /* UID specific keyring */
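Together with pipe_inode_info::user and the pipe_user_pages_{soft,hard} sysctls added above, this counter lets the kernel charge pipe pages to their creating user. A hedged sketch of the accounting those fields enable (helper names are illustrative):

static void example_account_pipe_buffers(struct pipe_inode_info *pipe,
					 unsigned long nr_pages)
{
	atomic_long_add(nr_pages, &pipe->user->pipe_bufs);
}

static bool example_over_hard_limit(struct user_struct *user)
{
	return pipe_user_pages_hard &&
	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
}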
index a43f41cb3c430166afc7fda3f09571007fe74acf..4d4780c00d345e9e5fd50f5394b89d93c57c966b 100644 (file)
@@ -15,10 +15,7 @@ struct shmem_inode_info {
        unsigned int            seals;          /* shmem seals */
        unsigned long           flags;
        unsigned long           alloced;        /* data pages alloced to file */
-       union {
-               unsigned long   swapped;        /* subtotal assigned to swap */
-               char            *symlink;       /* unswappable short symlink */
-       };
+       unsigned long           swapped;        /* subtotal assigned to swap */
        struct shared_policy    policy;         /* NUMA memory alloc policy */
        struct list_head        swaplist;       /* chain of maybes on swap */
        struct simple_xattrs    xattrs;         /* list of xattrs */
index f869807a0d0e2ca93629a7d25092f268dbc8f520..5322fea6fe4c7995ae3ad0cfad15a1818356576f 100644 (file)
@@ -51,6 +51,7 @@
 /* RPC/RDMA parameters and stats */
 extern unsigned int svcrdma_ord;
 extern unsigned int svcrdma_max_requests;
+extern unsigned int svcrdma_max_bc_requests;
 extern unsigned int svcrdma_max_req_size;
 
 extern atomic_t rdma_stat_recv;
@@ -69,6 +70,7 @@ extern atomic_t rdma_stat_sq_prod;
  * completes.
  */
 struct svc_rdma_op_ctxt {
+       struct list_head free;
        struct svc_rdma_op_ctxt *read_hdr;
        struct svc_rdma_fastreg_mr *frmr;
        int hdr_count;
@@ -112,6 +114,7 @@ struct svc_rdma_fastreg_mr {
        struct list_head frmr_list;
 };
 struct svc_rdma_req_map {
+       struct list_head free;
        unsigned long count;
        union {
                struct kvec sge[RPCSVC_MAXPAGES];
@@ -132,28 +135,32 @@ struct svcxprt_rdma {
        int                  sc_max_sge;
        int                  sc_max_sge_rd;     /* max sge for read target */
 
-       int                  sc_sq_depth;       /* Depth of SQ */
        atomic_t             sc_sq_count;       /* Number of SQ WR on queue */
-
-       int                  sc_max_requests;   /* Depth of RQ */
+       unsigned int         sc_sq_depth;       /* Depth of SQ */
+       unsigned int         sc_rq_depth;       /* Depth of RQ */
+       u32                  sc_max_requests;   /* Forward credits */
+       u32                  sc_max_bc_requests;/* Backward credits */
        int                  sc_max_req_size;   /* Size of each RQ WR buf */
 
        struct ib_pd         *sc_pd;
 
        atomic_t             sc_dma_used;
-       atomic_t             sc_ctxt_used;
+       spinlock_t           sc_ctxt_lock;
+       struct list_head     sc_ctxts;
+       int                  sc_ctxt_used;
+       spinlock_t           sc_map_lock;
+       struct list_head     sc_maps;
+
        struct list_head     sc_rq_dto_q;
        spinlock_t           sc_rq_dto_lock;
        struct ib_qp         *sc_qp;
        struct ib_cq         *sc_rq_cq;
        struct ib_cq         *sc_sq_cq;
-       struct ib_mr         *sc_phys_mr;       /* MR for server memory */
        int                  (*sc_reader)(struct svcxprt_rdma *,
                                          struct svc_rqst *,
                                          struct svc_rdma_op_ctxt *,
                                          int *, u32 *, u32, u32, u64, bool);
        u32                  sc_dev_caps;       /* distilled device caps */
-       u32                  sc_dma_lkey;       /* local dma key */
        unsigned int         sc_frmr_pg_list_len;
        struct list_head     sc_frmr_q;
        spinlock_t           sc_frmr_q_lock;
@@ -179,8 +186,18 @@ struct svcxprt_rdma {
 #define RPCRDMA_MAX_REQUESTS    32
 #define RPCRDMA_MAX_REQ_SIZE    4096
 
+/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our
+ * current NFSv4.1 implementation supports one backchannel slot.
+ */
+#define RPCRDMA_MAX_BC_REQUESTS        2
+
 #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
 
+/* svc_rdma_backchannel.c */
+extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
+                                   struct rpcrdma_msg *rmsgp,
+                                   struct xdr_buf *rcvbuf);
+
 /* svc_rdma_marshal.c */
 extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
@@ -206,6 +223,8 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
                                u32, u32, u64, bool);
 
 /* svc_rdma_sendto.c */
+extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *,
+                           struct svc_rdma_req_map *);
 extern int svc_rdma_sendto(struct svc_rqst *);
 extern struct rpcrdma_read_chunk *
        svc_rdma_get_read_chunk(struct rpcrdma_msg *);
@@ -214,13 +233,14 @@ extern struct rpcrdma_read_chunk *
 extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
 extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
                                enum rpcrdma_errcode);
-extern int svc_rdma_post_recv(struct svcxprt_rdma *);
+extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t);
 extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
 extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
 extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
 extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
-extern struct svc_rdma_req_map *svc_rdma_get_req_map(void);
-extern void svc_rdma_put_req_map(struct svc_rdma_req_map *);
+extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *);
+extern void svc_rdma_put_req_map(struct svcxprt_rdma *,
+                                struct svc_rdma_req_map *);
 extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
 extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
                              struct svc_rdma_fastreg_mr *);
@@ -234,6 +254,7 @@ extern struct svc_xprt_class svc_rdma_bc_class;
 #endif
 
 /* svc_rdma.c */
+extern struct workqueue_struct *svc_rdma_wq;
 extern int svc_rdma_init(void);
 extern void svc_rdma_cleanup(void);
 
index 613c29bd6baf4763e57530b794df843d9cea61ee..e13a1ace50e902ad0ae38a81f5563d67eaa4a00f 100644 (file)
@@ -43,6 +43,9 @@
 /* Default weight of a bound cooling device */
 #define THERMAL_WEIGHT_DEFAULT 0
 
+/* A value below 0 K (physically impossible) indicates an invalid/uninitialized temperature */
+#define THERMAL_TEMP_INVALID   -274000
+
 /* Unit conversion macros */
 #define DECI_KELVIN_TO_CELSIUS(t)      ({                      \
        long _t = (t);                                          \
@@ -167,6 +170,7 @@ struct thermal_attr {
  * @forced_passive:    If > 0, temperature at which to switch on all ACPI
  *                     processor cooling devices.  Currently only used by the
  *                     step-wise governor.
+ * @need_update:       if set to 1, thermal_zone_device_update() needs to be invoked.
  * @ops:       operations this &thermal_zone_device supports
  * @tzp:       thermal zone parameters
  * @governor:  pointer to the governor for this thermal zone
@@ -194,6 +198,7 @@ struct thermal_zone_device {
        int emul_temperature;
        int passive;
        unsigned int forced_passive;
+       atomic_t need_update;
        struct thermal_zone_device_ops *ops;
        struct thermal_zone_params *tzp;
        struct thermal_governor *governor;
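A hedged sketch of the need_update protocol the kernel-doc describes: an interrupt path marks the zone stale and a safe context later performs the update exactly once; both functions are illustrative:

static void example_mark_zone_stale(struct thermal_zone_device *tz)
{
	atomic_set(&tz->need_update, 1);
}

static void example_maybe_update(struct thermal_zone_device *tz)
{
	/* clear the flag and update only if it was set */
	if (atomic_cmpxchg(&tz->need_update, 1, 0) == 1)
		thermal_zone_device_update(tz);
}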
index 11528591d0d714f3ea1004566cb799952a9ca5bd..c34c9002460c567a8e7e3e8cd9cb6acb751c97d8 100644 (file)
@@ -83,6 +83,8 @@ struct rdma_dev_addr {
        int bound_dev_if;
        enum rdma_transport_type transport;
        struct net *net;
+       enum rdma_network_type network;
+       int hoplimit;
 };
 
 /**
@@ -91,8 +93,8 @@ struct rdma_dev_addr {
  *
  * The dev_addr->net field must be initialized.
  */
-int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
-                     u16 *vlan_id);
+int rdma_translate_ip(const struct sockaddr *addr,
+                     struct rdma_dev_addr *dev_addr, u16 *vlan_id);
 
 /**
  * rdma_resolve_ip - Resolve source and destination IP addresses to
@@ -117,6 +119,10 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
                                     struct rdma_dev_addr *addr, void *context),
                    void *context);
 
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+                         const struct sockaddr *dst_addr,
+                         struct rdma_dev_addr *addr);
+
 void rdma_addr_cancel(struct rdma_dev_addr *addr);
 
 int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
@@ -125,8 +131,10 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
 int rdma_addr_size(struct sockaddr *addr);
 
 int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id);
-int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid,
-                              u8 *smac, u16 *vlan_id, int if_index);
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+                                const union ib_gid *dgid,
+                                u8 *smac, u16 *vlan_id, int *if_index,
+                                int *hoplimit);
 
 static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
 {
index 269a27cf0a46f37c7fd9fdc3fc24df2d5070c2d2..e30f19bd4a41ef6900cda863c99f3b238a30cd47 100644 (file)
@@ -60,6 +60,7 @@ int ib_get_cached_gid(struct ib_device    *device,
  *   a specified GID value occurs.
  * @device: The device to query.
  * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
  * @ndev: In RoCE, the net device of the device. NULL means ignore.
  * @port_num: The port number of the device where the GID value was found.
  * @index: The index into the cached GID table where the GID was found.  This
@@ -70,6 +71,7 @@ int ib_get_cached_gid(struct ib_device    *device,
  */
 int ib_find_cached_gid(struct ib_device *device,
                       const union ib_gid *gid,
+                      enum ib_gid_type gid_type,
                       struct net_device *ndev,
                       u8               *port_num,
                       u16              *index);
@@ -79,6 +81,7 @@ int ib_find_cached_gid(struct ib_device *device,
  * GID value occurs
  * @device: The device to query.
  * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
 * @port_num: The port number of the device where the GID value should be
 *   searched.
 * @ndev: In RoCE, the net device of the device. NULL means ignore.
@@ -90,6 +93,7 @@ int ib_find_cached_gid(struct ib_device *device,
  */
 int ib_find_cached_gid_by_port(struct ib_device *device,
                               const union ib_gid *gid,
+                              enum ib_gid_type gid_type,
                               u8               port_num,
                               struct net_device *ndev,
                               u16              *index);
index ec9b44dd3d806b20d80920a0cb37487ea2d12c24..0ff049bd9ad413c47b92aaaf116a43d1738e4b32 100644 (file)
@@ -438,6 +438,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
 /**
  * ib_mad_recv_handler - callback handler for a received MAD.
  * @mad_agent: MAD agent requesting the received MAD.
+ * @send_buf: Send buffer if found, else NULL
  * @mad_recv_wc: Received work completion information on the received MAD.
  *
  * MADs received in response to a send request operation will be handed to
@@ -447,6 +448,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
  * modify the data referenced by @mad_recv_wc.
  */
 typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+                                   struct ib_mad_send_buf *send_buf,
                                    struct ib_mad_recv_wc *mad_recv_wc);
 
 /**
index e99d8f9a4551d9889501c645e04139fd0f32a3f8..0f3daae44bf9bf5b440c9d15d6a4d90544d29b88 100644 (file)
@@ -41,6 +41,8 @@ enum {
        IB_ETH_BYTES  = 14,
        IB_VLAN_BYTES = 4,
        IB_GRH_BYTES  = 40,
+       IB_IP4_BYTES  = 20,
+       IB_UDP_BYTES  = 8,
        IB_BTH_BYTES  = 12,
        IB_DETH_BYTES = 8
 };
@@ -223,6 +225,27 @@ struct ib_unpacked_eth {
        __be16  type;
 };
 
+struct ib_unpacked_ip4 {
+       u8      ver;
+       u8      hdr_len;
+       u8      tos;
+       __be16  tot_len;
+       __be16  id;
+       __be16  frag_off;
+       u8      ttl;
+       u8      protocol;
+       __sum16 check;
+       __be32  saddr;
+       __be32  daddr;
+};
+
+struct ib_unpacked_udp {
+       __be16  sport;
+       __be16  dport;
+       __be16  length;
+       __be16  csum;
+};
+
 struct ib_unpacked_vlan {
        __be16  tag;
        __be16  type;
@@ -237,6 +260,10 @@ struct ib_ud_header {
        struct ib_unpacked_vlan vlan;
        int                     grh_present;
        struct ib_unpacked_grh  grh;
+       int                     ipv4_present;
+       struct ib_unpacked_ip4  ip4;
+       int                     udp_present;
+       struct ib_unpacked_udp  udp;
        struct ib_unpacked_bth  bth;
        struct ib_unpacked_deth deth;
        int                     immediate_present;
@@ -253,13 +280,17 @@ void ib_unpack(const struct ib_field        *desc,
               void                         *buf,
               void                         *structure);
 
-void ib_ud_header_init(int                 payload_bytes,
-                      int                  lrh_present,
-                      int                  eth_present,
-                      int                  vlan_present,
-                      int                  grh_present,
-                      int                  immediate_present,
-                      struct ib_ud_header *header);
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header);
+
+int ib_ud_header_init(int                  payload_bytes,
+                     int                   lrh_present,
+                     int                   eth_present,
+                     int                   vlan_present,
+                     int                   grh_present,
+                     int                   ip_version,
+                     int                   udp_present,
+                     int                   immediate_present,
+                     struct ib_ud_header *header);
 
 int ib_ud_header_pack(struct ib_ud_header *header,
                      void                *buf);
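The reworked ib_ud_header_init() takes an IP version and a UDP flag so a RoCEv2 header can be described in one call, and it now returns an error code. A hedged sketch for an IPv4/UDP UD header with no LRH, GRH, VLAN, or immediate data (the parameter choices are assumptions):

static int example_init_rocev2_ipv4_hdr(int payload_bytes,
					struct ib_ud_header *header)
{
	int err;

	err = ib_ud_header_init(payload_bytes,
				0,	/* lrh_present */
				1,	/* eth_present */
				0,	/* vlan_present */
				0,	/* grh_present */
				4,	/* ip_version */
				1,	/* udp_present */
				0,	/* immediate_present */
				header);
	if (err)
		return err;

	/* for IPv4, the new helper computes the header checksum */
	header->ip4.check = ib_ud_ip4_csum(header);
	return 0;
}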
index a5889f18807b62f6f2d46f01a4650b5876647051..2f8a65c1fca7b77bc03c20e3b886036448645ad1 100644 (file)
@@ -42,6 +42,7 @@
  */
 #define IB_PMA_CLASS_CAP_ALLPORTSELECT  cpu_to_be16(1 << 8)
 #define IB_PMA_CLASS_CAP_EXT_WIDTH      cpu_to_be16(1 << 9)
+#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10)
 #define IB_PMA_CLASS_CAP_XMIT_WAIT      cpu_to_be16(1 << 12)
 
 #define IB_PMA_CLASS_PORT_INFO          cpu_to_be16(0x0001)
index 301969552d0a51e34dcd872daa303a6339721916..cdc1c81aa275bda38f97f1e71c3c774dfbc8c983 100644 (file)
@@ -160,6 +160,7 @@ struct ib_sa_path_rec {
        int          ifindex;
        /* ignored in IB */
        struct net  *net;
+       enum ib_gid_type gid_type;
 };
 
 static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec)
@@ -402,6 +403,8 @@ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
  */
 int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
                             struct ib_sa_mcmember_rec *rec,
+                            struct net_device *ndev,
+                            enum ib_gid_type gid_type,
                             struct ib_ah_attr *ah_attr);
 
 /**
index 120da1d7f57eb578c327507ad2affa5b89488dbb..284b00c8fea4a4f67943aa7979d1b51728e5732e 100644 (file)
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
 #include <linux/socket.h>
+#include <linux/irq_poll.h>
 #include <uapi/linux/if_ether.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 
 extern struct workqueue_struct *ib_wq;
+extern struct workqueue_struct *ib_comp_wq;
 
 union ib_gid {
        u8      raw[16];
@@ -67,7 +73,17 @@ union ib_gid {
 
 extern union ib_gid zgid;
 
+enum ib_gid_type {
+       /* If link layer is Ethernet, this is RoCE V1 */
+       IB_GID_TYPE_IB        = 0,
+       IB_GID_TYPE_ROCE      = 0,
+       IB_GID_TYPE_ROCE_UDP_ENCAP = 1,
+       IB_GID_TYPE_SIZE
+};
+
+#define ROCE_V2_UDP_DPORT      4791
 struct ib_gid_attr {
+       enum ib_gid_type        gid_type;
        struct net_device       *ndev;
 };
 
@@ -98,6 +114,35 @@ enum rdma_protocol_type {
 __attribute_const__ enum rdma_transport_type
 rdma_node_get_transport(enum rdma_node_type node_type);
 
+enum rdma_network_type {
+       RDMA_NETWORK_IB,
+       RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB,
+       RDMA_NETWORK_IPV4,
+       RDMA_NETWORK_IPV6
+};
+
+static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
+{
+       if (network_type == RDMA_NETWORK_IPV4 ||
+           network_type == RDMA_NETWORK_IPV6)
+               return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+       /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */
+       return IB_GID_TYPE_IB;
+}
+
+static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type,
+                                                           union ib_gid *gid)
+{
+       if (gid_type == IB_GID_TYPE_IB)
+               return RDMA_NETWORK_IB;
+
+       if (ipv6_addr_v4mapped((struct in6_addr *)gid))
+               return RDMA_NETWORK_IPV4;
+       else
+               return RDMA_NETWORK_IPV6;
+}
+
 enum rdma_link_layer {
        IB_LINK_LAYER_UNSPECIFIED,
        IB_LINK_LAYER_INFINIBAND,
@@ -105,24 +150,32 @@ enum rdma_link_layer {
 };
 
 enum ib_device_cap_flags {
-       IB_DEVICE_RESIZE_MAX_WR         = 1,
-       IB_DEVICE_BAD_PKEY_CNTR         = (1<<1),
-       IB_DEVICE_BAD_QKEY_CNTR         = (1<<2),
-       IB_DEVICE_RAW_MULTI             = (1<<3),
-       IB_DEVICE_AUTO_PATH_MIG         = (1<<4),
-       IB_DEVICE_CHANGE_PHY_PORT       = (1<<5),
-       IB_DEVICE_UD_AV_PORT_ENFORCE    = (1<<6),
-       IB_DEVICE_CURR_QP_STATE_MOD     = (1<<7),
-       IB_DEVICE_SHUTDOWN_PORT         = (1<<8),
-       IB_DEVICE_INIT_TYPE             = (1<<9),
-       IB_DEVICE_PORT_ACTIVE_EVENT     = (1<<10),
-       IB_DEVICE_SYS_IMAGE_GUID        = (1<<11),
-       IB_DEVICE_RC_RNR_NAK_GEN        = (1<<12),
-       IB_DEVICE_SRQ_RESIZE            = (1<<13),
-       IB_DEVICE_N_NOTIFY_CQ           = (1<<14),
-       IB_DEVICE_LOCAL_DMA_LKEY        = (1<<15),
-       IB_DEVICE_RESERVED              = (1<<16), /* old SEND_W_INV */
-       IB_DEVICE_MEM_WINDOW            = (1<<17),
+       IB_DEVICE_RESIZE_MAX_WR                 = (1 << 0),
+       IB_DEVICE_BAD_PKEY_CNTR                 = (1 << 1),
+       IB_DEVICE_BAD_QKEY_CNTR                 = (1 << 2),
+       IB_DEVICE_RAW_MULTI                     = (1 << 3),
+       IB_DEVICE_AUTO_PATH_MIG                 = (1 << 4),
+       IB_DEVICE_CHANGE_PHY_PORT               = (1 << 5),
+       IB_DEVICE_UD_AV_PORT_ENFORCE            = (1 << 6),
+       IB_DEVICE_CURR_QP_STATE_MOD             = (1 << 7),
+       IB_DEVICE_SHUTDOWN_PORT                 = (1 << 8),
+       IB_DEVICE_INIT_TYPE                     = (1 << 9),
+       IB_DEVICE_PORT_ACTIVE_EVENT             = (1 << 10),
+       IB_DEVICE_SYS_IMAGE_GUID                = (1 << 11),
+       IB_DEVICE_RC_RNR_NAK_GEN                = (1 << 12),
+       IB_DEVICE_SRQ_RESIZE                    = (1 << 13),
+       IB_DEVICE_N_NOTIFY_CQ                   = (1 << 14),
+
+       /*
+        * This device supports a per-device lkey or stag that can be
+        * used without performing a memory registration for the local
+        * memory.  Note that ULPs should never check this flag, but
+        * should instead use the local_dma_lkey flag in the ib_pd structure,
+        * which will always contain a usable lkey.
+        */
+       IB_DEVICE_LOCAL_DMA_LKEY                = (1 << 15),
+       IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16),
+       IB_DEVICE_MEM_WINDOW                    = (1 << 17),
        /*
         * Devices should set IB_DEVICE_UD_IP_SUM if they support
         * insertion of UDP and TCP checksum on outgoing UD IPoIB
@@ -130,18 +183,35 @@ enum ib_device_cap_flags {
         * incoming messages.  Setting this flag implies that the
         * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
         */
-       IB_DEVICE_UD_IP_CSUM            = (1<<18),
-       IB_DEVICE_UD_TSO                = (1<<19),
-       IB_DEVICE_XRC                   = (1<<20),
-       IB_DEVICE_MEM_MGT_EXTENSIONS    = (1<<21),
-       IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
-       IB_DEVICE_MEM_WINDOW_TYPE_2A    = (1<<23),
-       IB_DEVICE_MEM_WINDOW_TYPE_2B    = (1<<24),
-       IB_DEVICE_RC_IP_CSUM            = (1<<25),
-       IB_DEVICE_RAW_IP_CSUM           = (1<<26),
-       IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29),
-       IB_DEVICE_SIGNATURE_HANDOVER    = (1<<30),
-       IB_DEVICE_ON_DEMAND_PAGING      = (1<<31),
+       IB_DEVICE_UD_IP_CSUM                    = (1 << 18),
+       IB_DEVICE_UD_TSO                        = (1 << 19),
+       IB_DEVICE_XRC                           = (1 << 20),
+
+       /*
+        * This device supports the IB "base memory management extension",
+        * which includes support for fast registrations (IB_WR_REG_MR,
+        * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs).  This flag should
+        * also be set by any iWarp device which must support FRs to comply
+        * with the iWarp verbs spec.  iWarp devices also support the
+        * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the
+        * stag.
+        */
+       IB_DEVICE_MEM_MGT_EXTENSIONS            = (1 << 21),
+       IB_DEVICE_BLOCK_MULTICAST_LOOPBACK      = (1 << 22),
+       IB_DEVICE_MEM_WINDOW_TYPE_2A            = (1 << 23),
+       IB_DEVICE_MEM_WINDOW_TYPE_2B            = (1 << 24),
+       IB_DEVICE_RC_IP_CSUM                    = (1 << 25),
+       IB_DEVICE_RAW_IP_CSUM                   = (1 << 26),
+       /*
+        * Devices should set IB_DEVICE_CROSS_CHANNEL if they
+        * support execution of WQEs that involve synchronization
+        * of I/O operations with single completion queue managed
+        * by hardware.
+        */
+       IB_DEVICE_CROSS_CHANNEL         = (1 << 27),
+       IB_DEVICE_MANAGED_FLOW_STEERING         = (1 << 29),
+       IB_DEVICE_SIGNATURE_HANDOVER            = (1 << 30),
+       IB_DEVICE_ON_DEMAND_PAGING              = (1 << 31),
 };
 
 enum ib_signature_prot_cap {
@@ -184,6 +254,7 @@ struct ib_odp_caps {
 
 enum ib_cq_creation_flags {
        IB_CQ_FLAGS_TIMESTAMP_COMPLETION   = 1 << 0,
+       IB_CQ_FLAGS_IGNORE_OVERRUN         = 1 << 1,
 };
 
 struct ib_cq_init_attr {
@@ -393,6 +464,7 @@ union rdma_protocol_stats {
 #define RDMA_CORE_CAP_PROT_IB           0x00100000
 #define RDMA_CORE_CAP_PROT_ROCE         0x00200000
 #define RDMA_CORE_CAP_PROT_IWARP        0x00400000
+#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
 
 #define RDMA_CORE_PORT_IBA_IB          (RDMA_CORE_CAP_PROT_IB  \
                                        | RDMA_CORE_CAP_IB_MAD \
@@ -405,6 +477,12 @@ union rdma_protocol_stats {
                                        | RDMA_CORE_CAP_IB_CM   \
                                        | RDMA_CORE_CAP_AF_IB   \
                                        | RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP                      \
+                                       (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \
+                                       | RDMA_CORE_CAP_IB_MAD  \
+                                       | RDMA_CORE_CAP_IB_CM   \
+                                       | RDMA_CORE_CAP_AF_IB   \
+                                       | RDMA_CORE_CAP_ETH_AH)
 #define RDMA_CORE_PORT_IWARP           (RDMA_CORE_CAP_PROT_IWARP \
                                        | RDMA_CORE_CAP_IW_CM)
 #define RDMA_CORE_PORT_INTEL_OPA       (RDMA_CORE_PORT_IBA_IB  \
@@ -519,6 +597,17 @@ struct ib_grh {
        union ib_gid    dgid;
 };
 
+union rdma_network_hdr {
+       struct ib_grh ibgrh;
+       struct {
+               /* The IB spec states that if it's IPv4, the IP header
+                * occupies the last 20 bytes of the GRH space.
+                */
+               u8              reserved[20];
+               struct iphdr    roce4grh;
+       };
+};
+
 enum {
        IB_MULTICAST_QPN = 0xffffff
 };
@@ -734,7 +823,6 @@ enum ib_wc_opcode {
        IB_WC_RDMA_READ,
        IB_WC_COMP_SWAP,
        IB_WC_FETCH_ADD,
-       IB_WC_BIND_MW,
        IB_WC_LSO,
        IB_WC_LOCAL_INV,
        IB_WC_REG_MR,
@@ -755,10 +843,14 @@ enum ib_wc_flags {
        IB_WC_IP_CSUM_OK        = (1<<3),
        IB_WC_WITH_SMAC         = (1<<4),
        IB_WC_WITH_VLAN         = (1<<5),
+       IB_WC_WITH_NETWORK_HDR_TYPE     = (1<<6),
 };
 
 struct ib_wc {
-       u64                     wr_id;
+       union {
+               u64             wr_id;
+               struct ib_cqe   *wr_cqe;
+       };
        enum ib_wc_status       status;
        enum ib_wc_opcode       opcode;
        u32                     vendor_err;
@@ -777,6 +869,7 @@ struct ib_wc {
        u8                      port_num;       /* valid only for DR SMPs on switches */
        u8                      smac[ETH_ALEN];
        u16                     vlan_id;
+       u8                      network_hdr_type;
 };
 
 enum ib_cq_notify_flags {
@@ -866,6 +959,9 @@ enum ib_qp_type {
 enum ib_qp_create_flags {
        IB_QP_CREATE_IPOIB_UD_LSO               = 1 << 0,
        IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK   = 1 << 1,
+       IB_QP_CREATE_CROSS_CHANNEL              = 1 << 2,
+       IB_QP_CREATE_MANAGED_SEND               = 1 << 3,
+       IB_QP_CREATE_MANAGED_RECV               = 1 << 4,
        IB_QP_CREATE_NETIF_QP                   = 1 << 5,
        IB_QP_CREATE_SIGNATURE_EN               = 1 << 6,
        IB_QP_CREATE_USE_GFP_NOIO               = 1 << 7,
@@ -1027,7 +1123,6 @@ enum ib_wr_opcode {
        IB_WR_REG_MR,
        IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
        IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
-       IB_WR_BIND_MW,
        IB_WR_REG_SIG_MR,
        /* reserve values for low level drivers' internal use.
         * These values will not be used at all in the ib core layer.
@@ -1062,26 +1157,16 @@ struct ib_sge {
        u32     lkey;
 };
 
-/**
- * struct ib_mw_bind_info - Parameters for a memory window bind operation.
- * @mr: A memory region to bind the memory window to.
- * @addr: The address where the memory window should begin.
- * @length: The length of the memory window, in bytes.
- * @mw_access_flags: Access flags from enum ib_access_flags for the window.
- *
- * This struct contains the shared parameters for type 1 and type 2
- * memory window bind operations.
- */
-struct ib_mw_bind_info {
-       struct ib_mr   *mr;
-       u64             addr;
-       u64             length;
-       int             mw_access_flags;
+struct ib_cqe {
+       void (*done)(struct ib_cq *cq, struct ib_wc *wc);
 };
 
 struct ib_send_wr {
        struct ib_send_wr      *next;
-       u64                     wr_id;
+       union {
+               u64             wr_id;
+               struct ib_cqe   *wr_cqe;
+       };
        struct ib_sge          *sg_list;
        int                     num_sge;
        enum ib_wr_opcode       opcode;
@@ -1147,19 +1232,6 @@ static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr)
        return container_of(wr, struct ib_reg_wr, wr);
 }
 
-struct ib_bind_mw_wr {
-       struct ib_send_wr       wr;
-       struct ib_mw            *mw;
-       /* The new rkey for the memory window. */
-       u32                     rkey;
-       struct ib_mw_bind_info  bind_info;
-};
-
-static inline struct ib_bind_mw_wr *bind_mw_wr(struct ib_send_wr *wr)
-{
-       return container_of(wr, struct ib_bind_mw_wr, wr);
-}
-
 struct ib_sig_handover_wr {
        struct ib_send_wr       wr;
        struct ib_sig_attrs    *sig_attrs;
@@ -1175,7 +1247,10 @@ static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr)
 
 struct ib_recv_wr {
        struct ib_recv_wr      *next;
-       u64                     wr_id;
+       union {
+               u64             wr_id;
+               struct ib_cqe   *wr_cqe;
+       };
        struct ib_sge          *sg_list;
        int                     num_sge;
 };
@@ -1190,20 +1265,10 @@ enum ib_access_flags {
        IB_ACCESS_ON_DEMAND     = (1<<6),
 };
 
-struct ib_phys_buf {
-       u64      addr;
-       u64      size;
-};
-
-struct ib_mr_attr {
-       struct ib_pd    *pd;
-       u64             device_virt_addr;
-       u64             size;
-       int             mr_access_flags;
-       u32             lkey;
-       u32             rkey;
-};
-
+/*
+ * XXX: these are apparently used for ->rereg_user_mr, no idea why they
+ * are hidden here instead of a uapi header!
+ */
 enum ib_mr_rereg_flags {
        IB_MR_REREG_TRANS       = 1,
        IB_MR_REREG_PD          = (1<<1),
@@ -1211,18 +1276,6 @@ enum ib_mr_rereg_flags {
        IB_MR_REREG_SUPPORTED   = ((IB_MR_REREG_ACCESS << 1) - 1)
 };
 
-/**
- * struct ib_mw_bind - Parameters for a type 1 memory window bind operation.
- * @wr_id:      Work request id.
- * @send_flags: Flags from ib_send_flags enum.
- * @bind_info:  More parameters of the bind operation.
- */
-struct ib_mw_bind {
-       u64                    wr_id;
-       int                    send_flags;
-       struct ib_mw_bind_info bind_info;
-};
-
 struct ib_fmr_attr {
        int     max_pages;
        int     max_maps;
@@ -1307,6 +1360,12 @@ struct ib_ah {
 
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
+enum ib_poll_context {
+       IB_POLL_DIRECT,         /* caller context, no hw completions */
+       IB_POLL_SOFTIRQ,        /* poll from softirq context */
+       IB_POLL_WORKQUEUE,      /* poll from workqueue */
+};
+
 struct ib_cq {
        struct ib_device       *device;
        struct ib_uobject      *uobject;
@@ -1315,6 +1374,12 @@ struct ib_cq {
        void                   *cq_context;
        int                     cqe;
        atomic_t                usecnt; /* count number of work queues */
+       enum ib_poll_context    poll_ctx;
+       struct ib_wc            *wc;
+       union {
+               struct irq_poll         iop;
+               struct work_struct      work;
+       };
 };
 
 struct ib_srq {
@@ -1363,7 +1428,6 @@ struct ib_mr {
        u64                iova;
        u32                length;
        unsigned int       page_size;
-       atomic_t           usecnt; /* count number of MWs */
 };
 
 struct ib_mw {
@@ -1724,11 +1788,6 @@ struct ib_device {
                                                      int wc_cnt);
        struct ib_mr *             (*get_dma_mr)(struct ib_pd *pd,
                                                 int mr_access_flags);
-       struct ib_mr *             (*reg_phys_mr)(struct ib_pd *pd,
-                                                 struct ib_phys_buf *phys_buf_array,
-                                                 int num_phys_buf,
-                                                 int mr_access_flags,
-                                                 u64 *iova_start);
        struct ib_mr *             (*reg_user_mr)(struct ib_pd *pd,
                                                  u64 start, u64 length,
                                                  u64 virt_addr,
@@ -1741,8 +1800,6 @@ struct ib_device {
                                                    int mr_access_flags,
                                                    struct ib_pd *pd,
                                                    struct ib_udata *udata);
-       int                        (*query_mr)(struct ib_mr *mr,
-                                              struct ib_mr_attr *mr_attr);
        int                        (*dereg_mr)(struct ib_mr *mr);
        struct ib_mr *             (*alloc_mr)(struct ib_pd *pd,
                                               enum ib_mr_type mr_type,
@@ -1750,18 +1807,8 @@ struct ib_device {
        int                        (*map_mr_sg)(struct ib_mr *mr,
                                                struct scatterlist *sg,
                                                int sg_nents);
-       int                        (*rereg_phys_mr)(struct ib_mr *mr,
-                                                   int mr_rereg_mask,
-                                                   struct ib_pd *pd,
-                                                   struct ib_phys_buf *phys_buf_array,
-                                                   int num_phys_buf,
-                                                   int mr_access_flags,
-                                                   u64 *iova_start);
        struct ib_mw *             (*alloc_mw)(struct ib_pd *pd,
                                               enum ib_mw_type type);
-       int                        (*bind_mw)(struct ib_qp *qp,
-                                             struct ib_mw *mw,
-                                             struct ib_mw_bind *mw_bind);
        int                        (*dealloc_mw)(struct ib_mw *mw);
        struct ib_fmr *            (*alloc_fmr)(struct ib_pd *pd,
                                                int mr_access_flags,
@@ -1823,6 +1870,7 @@ struct ib_device {
        u16                          is_switch:1;
        u8                           node_type;
        u8                           phys_port_cnt;
+       struct ib_device_attr        attrs;
 
        /**
         * The following mandatory functions are used only at device
@@ -1888,6 +1936,31 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len
        return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
 }
 
+static inline bool ib_is_udata_cleared(struct ib_udata *udata,
+                                      size_t offset,
+                                      size_t len)
+{
+       const void __user *p = udata->inbuf + offset;
+       bool ret = false;
+       u8 *buf;
+
+       if (len > USHRT_MAX)
+               return false;
+
+       buf = kmalloc(len, GFP_KERNEL);
+       if (!buf)
+               return false;
+
+       if (copy_from_user(buf, p, len))
+               goto free;
+
+       ret = !memchr_inv(buf, 0, len);
+
+free:
+       kfree(buf);
+       return ret;
+}
+
 /**
  * ib_modify_qp_is_ok - Check that the supplied attribute mask
  * contains all required attributes and no attributes not allowed for
@@ -1912,9 +1985,6 @@ int ib_register_event_handler  (struct ib_event_handler *event_handler);
 int ib_unregister_event_handler(struct ib_event_handler *event_handler);
 void ib_dispatch_event(struct ib_event *event);
 
-int ib_query_device(struct ib_device *device,
-                   struct ib_device_attr *device_attr);
-
 int ib_query_port(struct ib_device *device,
                  u8 port_num, struct ib_port_attr *port_attr);
 
@@ -1967,6 +2037,17 @@ static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num)
 }
 
 static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num)
+{
+       return device->port_immutable[port_num].core_cap_flags &
+               (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
+}
+
+static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num)
+{
+       return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
+}
+
+static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num)
 {
        return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE;
 }
@@ -1978,8 +2059,8 @@ static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_n
 
 static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num)
 {
-       return device->port_immutable[port_num].core_cap_flags &
-               (RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_PROT_ROCE);
+       return rdma_protocol_ib(device, port_num) ||
+               rdma_protocol_roce(device, port_num);
 }
 
 /**
@@ -2220,7 +2301,8 @@ int ib_modify_port(struct ib_device *device,
                   struct ib_port_modify *port_modify);
 
 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
-               struct net_device *ndev, u8 *port_num, u16 *index);
+               enum ib_gid_type gid_type, struct net_device *ndev,
+               u8 *port_num, u16 *index);
 
 int ib_find_pkey(struct ib_device *device,
                 u8 port_num, u16 pkey, u16 *index);
@@ -2454,6 +2536,11 @@ static inline int ib_post_recv(struct ib_qp *qp,
        return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
 }
 
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+               int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx);
+void ib_free_cq(struct ib_cq *cq);
+int ib_process_cq_direct(struct ib_cq *cq, int budget);
+
 /**
  * ib_create_cq - Creates a CQ on the specified device.
  * @device: The device on which to create the CQ.
@@ -2838,13 +2925,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
                dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
 }
 
-/**
- * ib_query_mr - Retrieves information about a specific memory region.
- * @mr: The memory region to retrieve information about.
- * @mr_attr: The attributes of the specified memory region.
- */
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
-
 /**
  * ib_dereg_mr - Deregisters a memory region and removes it from the
  *   HCA translation table.
@@ -2881,42 +2961,6 @@ static inline u32 ib_inc_rkey(u32 rkey)
        return ((rkey + 1) & mask) | (rkey & ~mask);
 }
 
-/**
- * ib_alloc_mw - Allocates a memory window.
- * @pd: The protection domain associated with the memory window.
- * @type: The type of the memory window (1 or 2).
- */
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-
-/**
- * ib_bind_mw - Posts a work request to the send queue of the specified
- *   QP, which binds the memory window to the given address range and
- *   remote access attributes.
- * @qp: QP to post the bind work request on.
- * @mw: The memory window to bind.
- * @mw_bind: Specifies information about the memory window, including
- *   its address range, remote access rights, and associated memory region.
- *
- * If there is no immediate error, the function will update the rkey member
- * of the mw parameter to its new value. The bind operation can still fail
- * asynchronously.
- */
-static inline int ib_bind_mw(struct ib_qp *qp,
-                            struct ib_mw *mw,
-                            struct ib_mw_bind *mw_bind)
-{
-       /* XXX reference counting in corresponding MR? */
-       return mw->device->bind_mw ?
-               mw->device->bind_mw(qp, mw, mw_bind) :
-               -ENOSYS;
-}
-
-/**
- * ib_dealloc_mw - Deallocates a memory window.
- * @mw: The memory window to deallocate.
- */
-int ib_dealloc_mw(struct ib_mw *mw);
-
 /**
 * ib_alloc_fmr - Allocates an unmapped fast memory region.
  * @pd: The protection domain associated with the unmapped region.
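The wr_cqe unions and the ib_alloc_cq()/ib_free_cq()/ib_process_cq_direct() declarations added above suggest a completion model in which each work request carries its own done callback instead of an opaque wr_id. A hedged usage sketch (the request structure and posting path are illustrative):

struct example_req {
	struct ib_cqe	cqe;
	/* ... per-request state ... */
};

static void example_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct example_req *req =
		container_of(wc->wr_cqe, struct example_req, cqe);

	/* wc->status carries the completion result for this request */
	(void)req;
}

static int example_post(struct ib_qp *qp, struct example_req *req,
			struct ib_sge *sge)
{
	struct ib_send_wr wr = { .opcode = IB_WR_SEND };
	struct ib_send_wr *bad_wr;

	req->cqe.done = example_done;
	wr.wr_cqe = &req->cqe;
	wr.sg_list = sge;
	wr.num_sge = 1;
	return ib_post_send(qp, &wr, &bad_wr);
}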
diff --git a/include/scsi/iser.h b/include/scsi/iser.h
new file mode 100644 (file)
index 0000000..2e678fa
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ISCSI_ISER_H
+#define ISCSI_ISER_H
+
+#define ISER_ZBVA_NOT_SUP              0x80
+#define ISER_SEND_W_INV_NOT_SUP                0x40
+#define ISERT_ZBVA_NOT_USED            0x80
+#define ISERT_SEND_W_INV_NOT_USED      0x40
+
+#define ISCSI_CTRL     0x10
+#define ISER_HELLO     0x20
+#define ISER_HELLORPLY 0x30
+
+#define ISER_VER       0x10
+#define ISER_WSV       0x08
+#define ISER_RSV       0x04
+
+/**
+ * struct iser_cm_hdr - iSER CM header (from iSER Annex A12)
+ *
+ * @flags:        flags support (zbva, send_w_inv)
+ * @rsvd:         reserved
+ */
+struct iser_cm_hdr {
+       u8      flags;
+       u8      rsvd[3];
+} __packed;
+
+/**
+ * struct iser_ctrl - iSER header of iSCSI control PDU
+ *
+ * @flags:        opcode and read/write valid bits
+ * @rsvd:         reserved
+ * @write_stag:   write rkey
+ * @write_va:     write virtual address
+ * @read_stag:    read rkey
+ * @read_va:      read virtual address
+ */
+struct iser_ctrl {
+       u8      flags;
+       u8      rsvd[3];
+       __be32  write_stag;
+       __be64  write_va;
+       __be32  read_stag;
+       __be64  read_va;
+} __packed;
+
+#endif /* ISCSI_ISER_H */
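A hedged sketch of filling the iser_ctrl header above for a control PDU that advertises a valid write STag; the flag composition and the rkey/va values are assumptions for illustration:

static void example_fill_iser_hdr(struct iser_ctrl *hdr, u32 rkey, u64 va)
{
	memset(hdr, 0, sizeof(*hdr));
	hdr->flags = ISCSI_CTRL | ISER_WSV;	/* control PDU, write STag valid */
	hdr->write_stag = cpu_to_be32(rkey);
	hdr->write_va = cpu_to_be64(va);
}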
index 7990469a44ce3df7909bdd447cd743019f900f37..c4d76ff056c6efd3431a4ed7aa3c6f6888c69e80 100644 (file)
@@ -104,6 +104,7 @@ struct snd_timer_instance {
                           int event,
                           struct timespec * tstamp,
                           unsigned long resolution);
+       void (*disconnect)(struct snd_timer_instance *timeri);
        void *callback_data;
        unsigned long ticks;            /* auto-load ticks when expired */
        unsigned long cticks;           /* current ticks */
index 594b4b29a2241ba0493e22e09b0b66ea2d6c3cc4..4e4b2fa786097c3638e78a6df76995e598a4cecf 100644 (file)
@@ -43,7 +43,7 @@ struct extent_status;
        { EXT4_GET_BLOCKS_METADATA_NOFAIL,      "METADATA_NOFAIL" },    \
        { EXT4_GET_BLOCKS_NO_NORMALIZE,         "NO_NORMALIZE" },       \
        { EXT4_GET_BLOCKS_KEEP_SIZE,            "KEEP_SIZE" },          \
-       { EXT4_GET_BLOCKS_NO_LOCK,              "NO_LOCK" })
+       { EXT4_GET_BLOCKS_ZERO,                 "ZERO" })
 
 #define show_mflags(flags) __print_flags(flags, "",    \
        { EXT4_MAP_NEW,         "N" },                  \
index 0f803d2783e3834194cb3f5cd3dd4da5c017304c..47c6212d8f3cecb07d62cb9ea81f6ea65ae0c35f 100644 (file)
@@ -46,10 +46,10 @@ SCAN_STATUS
 
 TRACE_EVENT(mm_khugepaged_scan_pmd,
 
-       TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable,
+       TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
                 bool referenced, int none_or_zero, int status),
 
-       TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status),
+       TP_ARGS(mm, page, writable, referenced, none_or_zero, status),
 
        TP_STRUCT__entry(
                __field(struct mm_struct *, mm)
@@ -62,7 +62,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
 
        TP_fast_assign(
                __entry->mm = mm;
-               __entry->pfn = pfn;
+               __entry->pfn = page ? page_to_pfn(page) : -1;
                __entry->writable = writable;
                __entry->referenced = referenced;
                __entry->none_or_zero = none_or_zero;
@@ -104,10 +104,10 @@ TRACE_EVENT(mm_collapse_huge_page,
 
 TRACE_EVENT(mm_collapse_huge_page_isolate,
 
-       TP_PROTO(unsigned long pfn, int none_or_zero,
+       TP_PROTO(struct page *page, int none_or_zero,
                 bool referenced, bool  writable, int status),
 
-       TP_ARGS(pfn, none_or_zero, referenced, writable, status),
+       TP_ARGS(page, none_or_zero, referenced, writable, status),
 
        TP_STRUCT__entry(
                __field(unsigned long, pfn)
@@ -118,7 +118,7 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
        ),
 
        TP_fast_assign(
-               __entry->pfn = pfn;
+               __entry->pfn = page ? page_to_pfn(page) : -1;
                __entry->none_or_zero = none_or_zero;
                __entry->referenced = referenced;
                __entry->writable = writable;
index ff8f6c091a1504e3d18937244a07b6432c880fb2..f95f25e786ef0de423a3a036a1685ad53da496af 100644 (file)
@@ -15,7 +15,7 @@ struct softirq_action;
                         softirq_name(NET_TX)           \
                         softirq_name(NET_RX)           \
                         softirq_name(BLOCK)            \
-                        softirq_name(BLOCK_IOPOLL)     \
+                        softirq_name(IRQ_POLL)         \
                         softirq_name(TASKLET)          \
                         softirq_name(SCHED)            \
                         softirq_name(HRTIMER)          \
index c2e5d6cb34e36ba2e409985c7d66c70347dbebc7..ebd10e6245984fb5c38b6ee13c3861b8e78ed040 100644 (file)
@@ -307,7 +307,7 @@ header-y += nfs_mount.h
 header-y += nl80211.h
 header-y += n_r3964.h
 header-y += nubus.h
-header-y += nvme.h
+header-y += nvme_ioctl.h
 header-y += nvram.h
 header-y += omap3isp.h
 header-y += omapfb.h
index b8a5b3b86ad63ef0825e9a9aea1784bcc68fe425..41e0433b4a8398b3dc3da6d2d6aacb8e46287150 100644 (file)
@@ -2,8 +2,11 @@
 #define _UAPI_LINUX_FS_H
 
 /*
- * This file has definitions for some important file table
- * structures etc.
+ * This file has definitions for some important file table structures
+ * and constants and structures used by various generic file system
+ * ioctl's.  Please do not make any changes in this file before
+ * sending patches for review to linux-fsdevel@vger.kernel.org and
+ * linux-api@vger.kernel.org.
  */
 
 #include <linux/limits.h>
@@ -146,6 +149,37 @@ struct inodes_stat_t {
 #define MS_MGC_VAL 0xC0ED0000
 #define MS_MGC_MSK 0xffff0000
 
+/*
+ * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+       __u32           fsx_xflags;     /* xflags field value (get/set) */
+       __u32           fsx_extsize;    /* extsize field value (get/set)*/
+       __u32           fsx_nextents;   /* nextents field value (get)   */
+       __u32           fsx_projid;     /* project identifier (get/set) */
+       unsigned char   fsx_pad[12];
+};
+
+/*
+ * Flags for the fsx_xflags field
+ */
+#define FS_XFLAG_REALTIME      0x00000001      /* data in realtime volume */
+#define FS_XFLAG_PREALLOC      0x00000002      /* preallocated file extents */
+#define FS_XFLAG_IMMUTABLE     0x00000008      /* file cannot be modified */
+#define FS_XFLAG_APPEND                0x00000010      /* all writes append */
+#define FS_XFLAG_SYNC          0x00000020      /* all writes synchronous */
+#define FS_XFLAG_NOATIME       0x00000040      /* do not update access time */
+#define FS_XFLAG_NODUMP                0x00000080      /* do not include in backups */
+#define FS_XFLAG_RTINHERIT     0x00000100      /* create with rt bit set */
+#define FS_XFLAG_PROJINHERIT   0x00000200      /* create with parents projid */
+#define FS_XFLAG_NOSYMLINKS    0x00000400      /* disallow symlink creation */
+#define FS_XFLAG_EXTSIZE       0x00000800      /* extent size allocator hint */
+#define FS_XFLAG_EXTSZINHERIT  0x00001000      /* inherit inode extent size */
+#define FS_XFLAG_NODEFRAG      0x00002000      /* do not defragment */
+#define FS_XFLAG_FILESTREAM    0x00004000      /* use filestream allocator */
+#define FS_XFLAG_DAX           0x00008000      /* use DAX for IO */
+#define FS_XFLAG_HASATTR       0x80000000      /* no DIFLAG for this   */
+
 /* the read-only stuff doesn't really belong here, but any other place is
    probably as bad and I don't want to create yet another include file. */
 
@@ -210,9 +244,28 @@ struct inodes_stat_t {
 #define FS_IOC32_SETFLAGS              _IOW('f', 2, int)
 #define FS_IOC32_GETVERSION            _IOR('v', 1, int)
 #define FS_IOC32_SETVERSION            _IOW('v', 2, int)
+#define FS_IOC_FSGETXATTR              _IOR ('X', 31, struct fsxattr)
+#define FS_IOC_FSSETXATTR              _IOW ('X', 32, struct fsxattr)
 
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
+ *
+ * Note: for historical reasons, these flags were originally used and
+ * defined for use by ext2/ext3, and then other file systems started
+ * using these flags so they wouldn't need to write their own version
+ * of chattr/lsattr (which was shipped as part of e2fsprogs).  You
+ * should think twice before trying to use these flags in new
+ * contexts, or trying to assign these flags, since they are used both
+ * as the UAPI and the on-disk encoding for ext2/3/4.  Also, we are
+ * almost out of 32-bit flags.  :-)
+ *
+ * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from
+ * XFS to the generic FS level interface.  This uses a structure that
+ * has padding and hence has more room to grow, so it may be more
+ * appropriate for many new use cases.
+ *
+ * Please do not change these flags or interfaces before checking with
+ * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org.
  */
 #define        FS_SECRM_FL                     0x00000001 /* Secure deletion */
 #define        FS_UNRM_FL                      0x00000002 /* Undelete */
@@ -226,8 +279,8 @@ struct inodes_stat_t {
 #define FS_DIRTY_FL                    0x00000100
 #define FS_COMPRBLK_FL                 0x00000200 /* One or more compressed clusters */
 #define FS_NOCOMP_FL                   0x00000400 /* Don't compress */
-#define FS_ECOMPR_FL                   0x00000800 /* Compression error */
 /* End compression flags --- maybe not all used */
+#define FS_ENCRYPT_FL                  0x00000800 /* Encrypted file */
 #define FS_BTREE_FL                    0x00001000 /* btree format dir */
 #define FS_INDEX_FL                    0x00001000 /* hash-indexed directory */
 #define FS_IMAGIC_FL                   0x00002000 /* AFS directory */
@@ -235,9 +288,12 @@ struct inodes_stat_t {
 #define FS_NOTAIL_FL                   0x00008000 /* file tail should not be merged */
 #define FS_DIRSYNC_FL                  0x00010000 /* dirsync behaviour (directories only) */
 #define FS_TOPDIR_FL                   0x00020000 /* Top of directory hierarchies*/
+#define FS_HUGE_FILE_FL                        0x00040000 /* Reserved for ext4 */
 #define FS_EXTENT_FL                   0x00080000 /* Extents */
-#define FS_DIRECTIO_FL                 0x00100000 /* Use direct i/o */
+#define FS_EA_INODE_FL                 0x00200000 /* Inode used for large EA */
+#define FS_EOFBLOCKS_FL                        0x00400000 /* Reserved for ext4 */
 #define FS_NOCOW_FL                    0x00800000 /* Do not cow file */
+#define FS_INLINE_DATA_FL              0x10000000 /* Reserved for ext4 */
 #define FS_PROJINHERIT_FL              0x20000000 /* Create with parents projid */
 #define FS_RESERVED_FL                 0x80000000 /* reserved for ext2 lib */
 
index 928f98997d8a1811bfe31bfae00c3be12cbdce74..774a43128a7aa4c039576513e1e3ff7355d0fc0d 100644 (file)
@@ -33,6 +33,7 @@
 
 #define NVM_TTYPE_NAME_MAX 48
 #define NVM_TTYPE_MAX 63
+#define NVM_MMTYPE_LEN 8
 
 #define NVM_CTRL_FILE "/dev/lightnvm/control"
 
@@ -100,6 +101,26 @@ struct nvm_ioctl_remove {
        __u32 flags;
 };
 
+struct nvm_ioctl_dev_init {
+       char dev[DISK_NAME_LEN];                /* open-channel SSD device */
+       char mmtype[NVM_MMTYPE_LEN];            /* register to media manager */
+
+       __u32 flags;
+};
+
+enum {
+       NVM_FACTORY_ERASE_ONLY_USER     = 1 << 0, /* erase only blocks used as
+                                                  * host blks or grown blks */
+       NVM_FACTORY_RESET_HOST_BLKS     = 1 << 1, /* remove host blk marks */
+       NVM_FACTORY_RESET_GRWN_BBLKS    = 1 << 2, /* remove grown blk marks */
+       NVM_FACTORY_NR_BITS             = 1 << 3, /* stops here */
+};
+
+struct nvm_ioctl_dev_factory {
+       char dev[DISK_NAME_LEN];
+
+       __u32 flags;
+};
 
 /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */
 enum {
@@ -110,6 +131,12 @@ enum {
        /* device level cmds */
        NVM_DEV_CREATE_CMD,
        NVM_DEV_REMOVE_CMD,
+
+       /* Init a device to support LightNVM media managers */
+       NVM_DEV_INIT_CMD,
+
+       /* Factory reset device */
+       NVM_DEV_FACTORY_CMD,
 };
 
 #define NVM_IOCTL 'L' /* 0x4c */
@@ -122,6 +149,10 @@ enum {
                                                struct nvm_ioctl_create)
 #define NVM_DEV_REMOVE         _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \
                                                struct nvm_ioctl_remove)
+#define NVM_DEV_INIT           _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \
+                                               struct nvm_ioctl_dev_init)
+#define NVM_DEV_FACTORY                _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \
+                                               struct nvm_ioctl_dev_factory)
 
 #define NVM_VERSION_MAJOR      1
 #define NVM_VERSION_MINOR      0
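
As a hedged userspace sketch of the new NVM_DEV_INIT ioctl (the device
name "nvme0n1" and media manager name "gennvm" in the usage comment are
illustrative):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/lightnvm.h>

	int nvm_init_dev(const char *dev, const char *mmtype)
	{
		struct nvm_ioctl_dev_init init;
		int ret, fd = open(NVM_CTRL_FILE, O_WRONLY);

		if (fd < 0)
			return -1;
		memset(&init, 0, sizeof(init));
		strncpy(init.dev, dev, DISK_NAME_LEN - 1);
		strncpy(init.mmtype, mmtype, NVM_MMTYPE_LEN - 1);
		ret = ioctl(fd, NVM_DEV_INIT, &init);	/* e.g. ("nvme0n1", "gennvm") */
		close(fd);
		return ret;
	}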
index c33e1c489eb2af5b05e1fa00a5daf33caed99c26..8b8cfadf7833b6f2f29f4c290711d36726b107f9 100644 (file)
 typedef uint16_t blkif_vdev_t;
 typedef uint64_t blkif_sector_t;
 
+/*
+ * Multiple hardware queues/rings:
+ * If supported, the backend will write the key "multi-queue-max-queues" to
+ * the directory for that vbd, and set its value to the maximum supported
+ * number of queues.
+ * Frontends that are aware of this feature and wish to use it can write the
+ * key "multi-queue-num-queues" with the number they wish to use, which must be
+ * greater than zero, and no more than the value reported by the backend in
+ * "multi-queue-max-queues".
+ *
+ * For frontends requesting just one queue, the usual event-channel and
+ * ring-ref keys are written as before, simplifying the backend processing
+ * to avoid distinguishing between a frontend that doesn't understand the
+ * multi-queue feature, and one that does, but requested only one queue.
+ *
+ * Frontends requesting two or more queues must not write the toplevel
+ * event-channel and ring-ref keys, instead writing those keys under sub-keys
+ * having the name "queue-N" where N is the integer ID of the queue/ring to
+ * which those keys belong. Queues are indexed from zero.
+ * For example, a frontend with two queues must write the following set of
+ * queue-related keys:
+ *
+ * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
+ * /local/domain/1/device/vbd/0/queue-0 = ""
+ * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
+ * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
+ * /local/domain/1/device/vbd/0/queue-1 = ""
+ * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
+ * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
+ *
+ * It is also possible to use multiple queues/rings together with the
+ * multi-page ring buffer feature.
+ * For example, a frontend requesting two queues/rings, with each ring
+ * buffer sized at two pages, must write the following set of related keys:
+ *
+ * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
+ * /local/domain/1/device/vbd/0/ring-page-order = "1"
+ * /local/domain/1/device/vbd/0/queue-0 = ""
+ * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
+ * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
+ * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
+ * /local/domain/1/device/vbd/0/queue-1 = ""
+ * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
+ * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
+ * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
+ *
+ */
+
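
A rough frontend-side sketch of writing these keys via the xenbus API
(the transaction handling, path buffer, and per-ring rinfo[] structure
are illustrative, not part of this patch):

	/* Hypothetical snippet; "dir" is the frontend's xenstore
	 * directory, rinfo[] holds per-ring grant refs/event channels. */
	char path[64];
	unsigned int i;
	int err;

	err = xenbus_printf(xbt, dir, "multi-queue-num-queues",
			    "%u", nr_queues);
	for (i = 0; i < nr_queues; i++) {
		snprintf(path, sizeof(path), "%s/queue-%u", dir, i);
		err = xenbus_printf(xbt, path, "ring-ref", "%u",
				    rinfo[i].ring_ref);
		err = xenbus_printf(xbt, path, "event-channel", "%u",
				    rinfo[i].evtchn);
	}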
 /*
  * REQUEST CODES.
  */
index f4617cf07069213262f5c1c8bfb097ca7989b8ad..781c1399c6a3b0fa0b81c9567dd3096276a17d65 100644 (file)
@@ -795,7 +795,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 
        ro = mnt_want_write(mnt);       /* we'll drop it in any case */
        error = 0;
-       mutex_lock(&d_inode(root)->i_mutex);
+       inode_lock(d_inode(root));
        path.dentry = lookup_one_len(name->name, root, strlen(name->name));
        if (IS_ERR(path.dentry)) {
                error = PTR_ERR(path.dentry);
@@ -841,7 +841,7 @@ out_putfd:
                put_unused_fd(fd);
                fd = error;
        }
-       mutex_unlock(&d_inode(root)->i_mutex);
+       inode_unlock(d_inode(root));
        if (!ro)
                mnt_drop_write(mnt);
 out_putname:
@@ -866,7 +866,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
        err = mnt_want_write(mnt);
        if (err)
                goto out_name;
-       mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
        dentry = lookup_one_len(name->name, mnt->mnt_root,
                                strlen(name->name));
        if (IS_ERR(dentry)) {
@@ -884,7 +884,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
        dput(dentry);
 
 out_unlock:
-       mutex_unlock(&d_inode(mnt->mnt_root)->i_mutex);
+       inode_unlock(d_inode(mnt->mnt_root));
        if (inode)
                iput(inode);
        mnt_drop_write(mnt);
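
For reference, the inode_lock() conversions throughout this series rely
on wrappers that, at this point in the tree, simply encapsulate i_mutex
(a sketch of the helpers as defined in <linux/fs.h>):

	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

	static inline void inode_lock_nested(struct inode *inode,
					     unsigned subclass)
	{
		mutex_lock_nested(&inode->i_mutex, subclass);
	}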
index b471e5a3863ddbca70f2bf4dee22f40df0345fbe..cddd5b5fde514a205ed24a46b7a273643ab62c7c 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1493,7 +1493,7 @@ out_rcu_wakeup:
        wake_up_sem_queue_do(&tasks);
 out_free:
        if (sem_io != fast_sem_io)
-               ipc_free(sem_io, sizeof(ushort)*nsems);
+               ipc_free(sem_io);
        return err;
 }
 
index 0f401d94b7c657d5e7126fe78f149c94ffea8e24..798cad18dd878f2ee81f03e34431dbc001e2c2a9 100644 (file)
@@ -414,17 +414,12 @@ void *ipc_alloc(int size)
 /**
  * ipc_free - free ipc space
  * @ptr: pointer returned by ipc_alloc
- * @size: size of block
  *
- * Free a block created with ipc_alloc(). The caller must know the size
- * used in the allocation call.
+ * Free a block created with ipc_alloc().
  */
-void ipc_free(void *ptr, int size)
+void ipc_free(void *ptr)
 {
-       if (size > PAGE_SIZE)
-               vfree(ptr);
-       else
-               kfree(ptr);
+       kvfree(ptr);
 }
 
 /**
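
For reference, kvfree() (in mm/util.c) performs the same dispatch that
ipc_free() used to open-code, keyed on the pointer rather than on a
caller-supplied size (sketch of the existing helper):

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}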
index 3a8a5a0eca6252f04cf188fa6921291f9ec6fc31..51f7ca58ac6742989c61c0e0fc8d73bc7eab6bf7 100644 (file)
@@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
  * both function can sleep
  */
 void *ipc_alloc(int size);
-void ipc_free(void *ptr, int size);
+void ipc_free(void *ptr);
 
 /*
  * For allocation that need to be freed by RCU.
index 27c6046c2c3d459c1f2345b191e647b94fa7f02e..f84f8d06e1f6d1010b9e61065c71596f320e28d9 100644 (file)
@@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
        if (IS_ERR(dentry))
                return (void *)dentry; /* returning an error */
        inode = path.dentry->d_inode;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
        if (unlikely(!audit_mark)) {
index 656c7e93ac0d30d3e42a8f7e0dfc7dd071360d78..9f194aad0adc0600ce257581315824f540eb9c0b 100644 (file)
@@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
        struct dentry *d = kern_path_locked(watch->path, parent);
        if (IS_ERR(d))
                return PTR_ERR(d);
-       mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+       inode_unlock(d_backing_inode(parent->dentry));
        if (d_is_positive(d)) {
                /* update watch filter fields */
                watch->dev = d_backing_inode(d)->i_sb->s_dev;
index c0957416b32edd7f945201839c3058128bd718f5..06ae52e99ac255fcf4ec7925205742b85426e487 100644 (file)
@@ -4872,9 +4872,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
        struct perf_event *event = filp->private_data;
        int retval;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        retval = fasync_helper(fd, filp, on, &event->fasync);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (retval < 0)
                return retval;
index 0b4570cfacaeb2f5290d8f4b98d256943d22333c..074994bcfa9be14336ecb06912a969f6d9546c9d 100644 (file)
@@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
        if (!desc->count)
                return 0;
 
-       mutex_lock(&file_inode(filp)->i_mutex);
+       inode_lock(file_inode(filp));
        do {
                if (!relay_file_read_avail(buf, *ppos))
                        break;
@@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
                        *ppos = relay_file_read_end_pos(buf, read_start, ret);
                }
        } while (desc->count && ret);
-       mutex_unlock(&file_inode(filp)->i_mutex);
+       inode_unlock(file_inode(filp));
 
        return desc->written;
 }
index 44253adb3c36dc28fb84cf9fdda4609f59dc388c..63d3a24e081ad311b59152d884209c7d1aa81c70 100644 (file)
@@ -222,9 +222,9 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 
        /* Ensure the static_key remains in a consistent state */
        inode = file_inode(filp);
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        i = sched_feat_set(cmp);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if (i == __SCHED_FEAT_NR)
                return -EINVAL;
 
index 91420362e0b339fcdfe930066dab863604357b13..97715fd9e790ade5d7cd7731107de2dafb8272e3 100644 (file)
@@ -1757,6 +1757,20 @@ static struct ctl_table fs_table[] = {
                .proc_handler   = &pipe_proc_fn,
                .extra1         = &pipe_min_size,
        },
+       {
+               .procname       = "pipe-user-pages-hard",
+               .data           = &pipe_user_pages_hard,
+               .maxlen         = sizeof(pipe_user_pages_hard),
+               .mode           = 0644,
+               .proc_handler   = proc_doulongvec_minmax,
+       },
+       {
+               .procname       = "pipe-user-pages-soft",
+               .data           = &pipe_user_pages_soft,
+               .maxlen         = sizeof(pipe_user_pages_soft),
+               .mode           = 0644,
+               .proc_handler   = proc_doulongvec_minmax,
+       },
        { }
 };
 
index 5a0c1c83cdf0af97ea9446657b3527dc1e206d45..133ebc0c17735bf14be352776609bab732347f9d 100644 (file)
@@ -210,9 +210,11 @@ config RANDOM32_SELFTEST
 # compression support is select'ed if needed
 #
 config 842_COMPRESS
+       select CRC32
        tristate
 
 config 842_DECOMPRESS
+       select CRC32
        tristate
 
 config ZLIB_INFLATE
@@ -475,6 +477,11 @@ config DDR
          information. This data is useful for drivers handling
          DDR SDRAM controllers.
 
+config IRQ_POLL
+       bool "IRQ polling library"
+       help
+         Helper library for interrupt mitigation using polling (similar to NAPI).
+
 config MPILIB
        tristate
        select CLZ_TAB
index 2d4bc33d09b42097085480d7ce93ec3a8882faba..a7c26a41a738bd1dee37a0654cc87e5c51d65653 100644 (file)
@@ -165,6 +165,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o
 
 obj-$(CONFIG_SG_SPLIT) += sg_split.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
+obj-$(CONFIG_IRQ_POLL) += irq_poll.o
 
 libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
               fdt_empty_tree.o
diff --git a/lib/irq_poll.c b/lib/irq_poll.c
new file mode 100644 (file)
index 0000000..836f7db
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * Functions related to interrupt-poll handling. This is similar to
+ * NAPI for network devices.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq_poll.h>
+#include <linux/delay.h>
+
+static unsigned int irq_poll_budget __read_mostly = 256;
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
+
+/**
+ * irq_poll_sched - Schedule a run of the iopoll handler
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Add this irq_poll structure to the pending poll list and trigger the
+ *     raise of the irq_poll softirq.
+ **/
+void irq_poll_sched(struct irq_poll *iop)
+{
+       unsigned long flags;
+
+       if (test_bit(IRQ_POLL_F_DISABLE, &iop->state))
+               return;
+       if (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state))
+               return;
+
+       local_irq_save(flags);
+       list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
+       __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(irq_poll_sched);
+
+/**
+ * __irq_poll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     See irq_poll_complete(). This function must be called with interrupts
+ *     disabled.
+ **/
+static void __irq_poll_complete(struct irq_poll *iop)
+{
+       list_del(&iop->list);
+       smp_mb__before_atomic();
+       clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state);
+}
+
+/**
+ * irq_poll_complete - Mark this @iop as un-polled again
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     If a driver consumes less than the assigned budget in its run of the
+ *     iopoll handler, it'll end the polled mode by calling this function. The
+ *     iopoll handler will not be invoked again before irq_poll_sched()
+ *     is called.
+ **/
+void irq_poll_complete(struct irq_poll *iop)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __irq_poll_complete(iop);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(irq_poll_complete);
+
+static void irq_poll_softirq(struct softirq_action *h)
+{
+       struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
+       int rearm = 0, budget = irq_poll_budget;
+       unsigned long start_time = jiffies;
+
+       local_irq_disable();
+
+       while (!list_empty(list)) {
+               struct irq_poll *iop;
+               int work, weight;
+
+               /*
+                * If softirq window is exhausted then punt.
+                */
+               if (budget <= 0 || time_after(jiffies, start_time)) {
+                       rearm = 1;
+                       break;
+               }
+
+               local_irq_enable();
+
+               /* Even though interrupts have been re-enabled, this
+                * access is safe because interrupts can only add new
+                * entries to the tail of this list, and only ->poll()
+                * calls can remove this head entry from the list.
+                */
+               iop = list_entry(list->next, struct irq_poll, list);
+
+               weight = iop->weight;
+               work = 0;
+               if (test_bit(IRQ_POLL_F_SCHED, &iop->state))
+                       work = iop->poll(iop, weight);
+
+               budget -= work;
+
+               local_irq_disable();
+
+               /*
+                * Drivers must not modify the iopoll state if they
+                * consume their assigned weight (or more; some drivers
+                * can't easily just stop processing, they have to
+                * complete an entire mask of commands). In such cases
+                * this code still "owns" the iopoll instance and
+                * therefore can move the instance around on the list
+                * at will.
+                */
+               if (work >= weight) {
+                       if (test_bit(IRQ_POLL_F_DISABLE, &iop->state))
+                               __irq_poll_complete(iop);
+                       else
+                               list_move_tail(&iop->list, list);
+               }
+       }
+
+       if (rearm)
+               __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+
+       local_irq_enable();
+}
+
+/**
+ * irq_poll_disable - Disable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Disable io polling and wait for any pending callbacks to have completed.
+ **/
+void irq_poll_disable(struct irq_poll *iop)
+{
+       set_bit(IRQ_POLL_F_DISABLE, &iop->state);
+       while (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state))
+               msleep(1);
+       clear_bit(IRQ_POLL_F_DISABLE, &iop->state);
+}
+EXPORT_SYMBOL(irq_poll_disable);
+
+/**
+ * irq_poll_enable - Enable iopoll on this @iop
+ * @iop:      The parent iopoll structure
+ *
+ * Description:
+ *     Enable iopoll on this @iop. Note that this does not schedule a
+ *     handler run; it only marks the instance as active.
+ **/
+void irq_poll_enable(struct irq_poll *iop)
+{
+       BUG_ON(!test_bit(IRQ_POLL_F_SCHED, &iop->state));
+       smp_mb__before_atomic();
+       clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state);
+}
+EXPORT_SYMBOL(irq_poll_enable);
+
+/**
+ * irq_poll_init - Initialize this @iop
+ * @iop:      The parent iopoll structure
+ * @weight:   The default weight (or command completion budget)
+ * @poll_fn:  The handler to invoke
+ *
+ * Description:
+ *     Initialize and enable this irq_poll structure.
+ **/
+void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn)
+{
+       memset(iop, 0, sizeof(*iop));
+       INIT_LIST_HEAD(&iop->list);
+       iop->weight = weight;
+       iop->poll = poll_fn;
+}
+EXPORT_SYMBOL(irq_poll_init);
+
+static int irq_poll_cpu_notify(struct notifier_block *self,
+                                unsigned long action, void *hcpu)
+{
+       /*
+        * If a CPU goes away, splice its entries to the current CPU
+        * and trigger a run of the softirq
+        */
+       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+               int cpu = (unsigned long) hcpu;
+
+               local_irq_disable();
+               list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+                                this_cpu_ptr(&blk_cpu_iopoll));
+               __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+               local_irq_enable();
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block irq_poll_cpu_notifier = {
+       .notifier_call  = irq_poll_cpu_notify,
+};
+
+static __init int irq_poll_setup(void)
+{
+       int i;
+
+       for_each_possible_cpu(i)
+               INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
+
+       open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq);
+       register_hotcpu_notifier(&irq_poll_cpu_notifier);
+       return 0;
+}
+subsys_initcall(irq_poll_setup);
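
To make the intended flow concrete, a hedged sketch of a driver
consuming this API (my_dev, my_irq, and the completion-processing
helper are hypothetical):

	struct my_dev {
		struct irq_poll iop;
		/* ... */
	};

	/* Poll handler: process up to @budget completions; leave polled
	 * mode (and re-enable device interrupts) when work runs out. */
	static int my_poll(struct irq_poll *iop, int budget)
	{
		struct my_dev *dev = container_of(iop, struct my_dev, iop);
		int done = my_process_completions(dev, budget); /* hypothetical */

		if (done < budget)
			irq_poll_complete(iop);
		return done;
	}

	static irqreturn_t my_irq(int irq, void *data)
	{
		struct my_dev *dev = data;

		/* mask device interrupts, then defer to the softirq */
		irq_poll_sched(&dev->iop);
		return IRQ_HANDLED;
	}

	/* at probe time: irq_poll_init(&dev->iop, 32, my_poll); */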
index 31ce853fbfb1be6a2b1905e593377520ab951eab..74a54b7f25626e8c6d224af2b7384f7dcbf2e72f 100644 (file)
@@ -75,3 +75,4 @@ module_exit(libcrc32c_mod_fini);
 MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
 MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
 MODULE_LICENSE("GPL");
+MODULE_SOFTDEP("pre: crc32c");
index 028f5d996eef64aa960546010e8321b1b0c5a369..28ba40b99337e7098783b6693c55527ea1952ab3 100644 (file)
@@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc)
  * @seq: the seq_file to print into
  * @lc: the lru cache to print statistics of
  */
-size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
+void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
 {
        /* NOTE:
         * total calls to lc_get are
@@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
        seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
                   lc->name, lc->used, lc->nr_elements,
                   lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
-
-       return 0;
 }
 
 static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
index 40e03ea2a967d978cffae1cbeffe23d609ce01bc..2c5de86460c5bb82f3d1c83b136b2f985484298a 100644 (file)
@@ -49,7 +49,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func)
                if (rs->missed)
                        printk(KERN_WARNING "%s: %d callbacks suppressed\n",
                                func, rs->missed);
-               rs->begin   = 0;
+               rs->begin   = jiffies;
                rs->printed = 0;
                rs->missed  = 0;
        }
index 847ee43c28068a0fb744fe124c1d8afa429d0719..bc943867d68c68dab4109fe715c8d37d6c72757a 100644 (file)
@@ -11,6 +11,7 @@
  */
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/capability.h>
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
        __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
 
        if (shadow) {
-               mapping->nrshadows++;
+               mapping->nrexceptional++;
                /*
-                * Make sure the nrshadows update is committed before
+                * Make sure the nrexceptional update is committed before
                 * the nrpages update so that final truncate racing
                 * with reclaim does not see both counters 0 at the
                 * same time and miss a shadow entry.
@@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 {
        int err = 0;
 
+       if (dax_mapping(mapping) && mapping->nrexceptional) {
+               err = dax_writeback_mapping_range(mapping, lstart, lend);
+               if (err)
+                       return err;
+       }
+
        if (mapping->nrpages) {
                err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                 WB_SYNC_ALL);
@@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
                p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
                if (!radix_tree_exceptional_entry(p))
                        return -EEXIST;
+
+               if (WARN_ON(dax_mapping(mapping)))
+                       return -EINVAL;
+
                if (shadowp)
                        *shadowp = p;
-               mapping->nrshadows--;
+               mapping->nrexceptional--;
                if (node)
                        workingset_node_shadows_dec(node);
        }
@@ -1245,9 +1256,9 @@ repeat:
                        if (radix_tree_deref_retry(page))
                                goto restart;
                        /*
-                        * A shadow entry of a recently evicted page,
-                        * or a swap entry from shmem/tmpfs.  Return
-                        * it without attempting to raise page count.
+                        * A shadow entry of a recently evicted page, a swap
+                        * entry from shmem/tmpfs or a DAX entry.  Return it
+                        * without attempting to raise page count.
                         */
                        goto export;
                }
@@ -1494,6 +1505,74 @@ repeat:
 }
 EXPORT_SYMBOL(find_get_pages_tag);
 
+/**
+ * find_get_entries_tag - find and return entries that match @tag
+ * @mapping:   the address_space to search
+ * @start:     the starting page cache index
+ * @tag:       the tag index
+ * @nr_entries:        the maximum number of entries
+ * @entries:   where the resulting entries are placed
+ * @indices:   the cache indices corresponding to the entries in @entries
+ *
+ * Like find_get_entries, except we only return entries which are tagged with
+ * @tag.
+ */
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+                       int tag, unsigned int nr_entries,
+                       struct page **entries, pgoff_t *indices)
+{
+       void **slot;
+       unsigned int ret = 0;
+       struct radix_tree_iter iter;
+
+       if (!nr_entries)
+               return 0;
+
+       rcu_read_lock();
+restart:
+       radix_tree_for_each_tagged(slot, &mapping->page_tree,
+                                  &iter, start, tag) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot(slot);
+               if (unlikely(!page))
+                       continue;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               goto restart;
+                       }
+
+                       /*
+                        * A shadow entry of a recently evicted page, a swap
+                        * entry from shmem/tmpfs or a DAX entry.  Return it
+                        * without attempting to raise page count.
+                        */
+                       goto export;
+               }
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               /* Has the page moved? */
+               if (unlikely(page != *slot)) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+export:
+               indices[ret] = iter.index;
+               entries[ret] = page;
+               if (++ret == nr_entries)
+                       break;
+       }
+       rcu_read_unlock();
+       return ret;
+}
+EXPORT_SYMBOL(find_get_entries_tag);
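
A usage sketch, loosely modelled on how a DAX writeback loop might
batch tagged entries (the batch size and the process_entry() helper are
illustrative):

	#define BATCH 16
	struct page *entries[BATCH];
	pgoff_t indices[BATCH], index = start;
	unsigned int i, nr;

	while ((nr = find_get_entries_tag(mapping, index,
					  PAGECACHE_TAG_TOWRITE, BATCH,
					  entries, indices))) {
		for (i = 0; i < nr; i++)
			/* entries[] may contain exceptional (e.g. DAX)
			 * entries, not just struct pages */
			process_entry(mapping, indices[i], entries[i]);
		index = indices[nr - 1] + 1;
	}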
+
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
  * a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2684,11 +2763,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = __generic_file_write_iter(iocb, from);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 
        if (ret > 0) {
                ssize_t err;
index 8ad580273521046904ec13478e7b897839d955d7..fd3a07b3e6f4e086b2111c312a78512bed927f9b 100644 (file)
@@ -1560,7 +1560,8 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        struct mm_struct *mm = tlb->mm;
        int ret = 0;
 
-       if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (!ptl)
                goto out_unlocked;
 
        orig_pmd = *pmd;
@@ -1627,7 +1628,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        pmd_t orig_pmd;
        spinlock_t *ptl;
 
-       if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
+       ptl = __pmd_trans_huge_lock(pmd, vma);
+       if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
@@ -1690,7 +1692,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
-       if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
+       old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
+       if (old_ptl) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1724,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        spinlock_t *ptl;
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = __pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
@@ -1760,14 +1764,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  * Note that if it returns true, this routine returns without unlocking page
  * table lock. So callers must unlock it.
  */
-bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-               spinlock_t **ptl)
+spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 {
-       *ptl = pmd_lock(vma->vm_mm, pmd);
+       spinlock_t *ptl;
+       ptl = pmd_lock(vma->vm_mm, pmd);
        if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
-               return true;
-       spin_unlock(*ptl);
-       return false;
+               return ptl;
+       spin_unlock(ptl);
+       return NULL;
 }
 
 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2068,7 +2072,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
        if (likely(writable)) {
                if (likely(referenced)) {
                        result = SCAN_SUCCEED;
-                       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                       trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                                            referenced, writable, result);
                        return 1;
                }
@@ -2078,7 +2082,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 out:
        release_pte_pages(pte, _pte);
-       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+       trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                            referenced, writable, result);
        return 0;
 }
@@ -2576,7 +2580,7 @@ out_unmap:
                collapse_huge_page(mm, address, hpage, vma, node);
        }
 out:
-       trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+       trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
                                     none_or_zero, result);
        return ret;
 }
index ca052f2a4a0b374eb9a2a6498b407be7da28ac48..d06cae2de783acf462162e27f8770ff7bf60f4ad 100644 (file)
@@ -4638,7 +4638,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
@@ -4826,7 +4827,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
        union mc_target target;
        struct page *page;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                if (mc.precharge < HPAGE_PMD_NR) {
                        spin_unlock(ptl);
                        return 0;
index 2a565ed8bb4907398a0d3d2619fd4939df777970..563f320454902df04e782f73aa8d0473656034c9 100644 (file)
@@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        unsigned char *vec = walk->private;
        int nr = (end - addr) >> PAGE_SHIFT;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                memset(vec, 1, nr);
                spin_unlock(ptl);
                goto out;
index e1e2b1207bf2ee00604468d027b3879381e1b089..96f00104192861aec6358d9fae5f9b39fecb43a4 100644 (file)
@@ -175,7 +175,7 @@ static void __munlock_isolation_failed(struct page *page)
  */
 unsigned int munlock_vma_page(struct page *page)
 {
-       unsigned int nr_pages;
+       int nr_pages;
        struct zone *zone = page_zone(page);
 
        /* For try_to_munlock() and to serialize with page migration */
index 8a943b97a053a6aef9c939b6b06854fedea01b7b..998607adf6eb840a6ae30371484839277eaaab6b 100644 (file)
@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
 /**
  * pcpu_mem_free - free memory
  * @ptr: memory to free
- * @size: size of the area
  *
  * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
  */
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
 {
-       if (size <= PAGE_SIZE)
-               kfree(ptr);
-       else
-               vfree(ptr);
+       kvfree(ptr);
 }
 
 /**
@@ -463,8 +459,8 @@ out_unlock:
         * pcpu_mem_free() might end up calling vfree() which uses
         * IRQ-unsafe lock and thus can't be called under pcpu_lock.
         */
-       pcpu_mem_free(old, old_size);
-       pcpu_mem_free(new, new_size);
+       pcpu_mem_free(old);
+       pcpu_mem_free(new);
 
        return 0;
 }
@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
        chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
                                                sizeof(chunk->map[0]));
        if (!chunk->map) {
-               pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+               pcpu_mem_free(chunk);
                return NULL;
        }
 
@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
 {
        if (!chunk)
                return;
-       pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
-       pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+       pcpu_mem_free(chunk->map);
+       pcpu_mem_free(chunk);
 }
 
 /**
index fa2ceb2d2655dbd8e06ecd4e02384df5ad5b5891..440e2a7e6c1c2ca706376eb72d719fb8b83af05a 100644 (file)
@@ -701,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode)
                        list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
-       } else
-               kfree(info->symlink);
+       }
 
        simple_xattrs_free(&info->xattrs);
        WARN_ON(inode->i_blocks);
@@ -1902,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
        if (whence != SEEK_DATA && whence != SEEK_HOLE)
                return generic_file_llseek_size(file, offset, whence,
                                        MAX_LFS_FILESIZE, i_size_read(inode));
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        /* We're holding i_mutex so we can access i_size directly */
 
        if (offset < 0)
@@ -1926,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 
        if (offset >= 0)
                offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return offset;
 }
 
@@ -2091,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
        if (seals & ~(unsigned int)F_ALL_SEALS)
                return -EINVAL;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if (info->seals & F_SEAL_SEAL) {
                error = -EPERM;
@@ -2114,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
        error = 0;
 
 unlock:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return error;
 }
 EXPORT_SYMBOL_GPL(shmem_add_seals);
@@ -2164,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                struct address_space *mapping = file->f_mapping;
@@ -2277,7 +2276,7 @@ undone:
        inode->i_private = NULL;
        spin_unlock(&inode->i_lock);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return error;
 }
 
@@ -2549,13 +2548,12 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
        info = SHMEM_I(inode);
        inode->i_size = len-1;
        if (len <= SHORT_SYMLINK_LEN) {
-               info->symlink = kmemdup(symname, len, GFP_KERNEL);
-               if (!info->symlink) {
+               inode->i_link = kmemdup(symname, len, GFP_KERNEL);
+               if (!inode->i_link) {
                        iput(inode);
                        return -ENOMEM;
                }
                inode->i_op = &shmem_short_symlink_operations;
-               inode->i_link = info->symlink;
        } else {
                inode_nohighmem(inode);
                error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
@@ -3132,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 static void shmem_destroy_callback(struct rcu_head *head)
 {
        struct inode *inode = container_of(head, struct inode, i_rcu);
+       kfree(inode->i_link);
        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
index c43f654a7b645474592de879f0664e4698d9dfbd..d2c37365e2d6e429cf0f7cdbb62308f5e76aea93 100644 (file)
@@ -1956,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                set_blocksize(bdev, old_block_size);
                blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
        } else {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                inode->i_flags &= ~S_SWAPFILE;
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
        filp_close(swap_file, NULL);
 
@@ -2183,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
                p->flags |= SWP_BLKDEV;
        } else if (S_ISREG(inode->i_mode)) {
                p->bdev = inode->i_sb->s_bdev;
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                if (IS_SWAPFILE(inode))
                        return -EBUSY;
        } else
@@ -2416,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        mapping = swap_file->f_mapping;
        inode = mapping->host;
 
-       /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
+       /* If S_ISREG(inode->i_mode), claim_swapfile() will do inode_lock(inode) */
        error = claim_swapfile(p, inode);
        if (unlikely(error))
                goto bad_swap;
@@ -2561,7 +2561,7 @@ bad_swap:
        vfree(cluster_info);
        if (swap_file) {
                if (inode && S_ISREG(inode->i_mode)) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        inode = NULL;
                }
                filp_close(swap_file, NULL);
@@ -2574,7 +2574,7 @@ out:
        if (name)
                putname(name);
        if (inode && S_ISREG(inode->i_mode))
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        return error;
 }
 
index 76e35ad971025ce5eb3781543537d1bf3b947b8d..e3ee0e27cd17f20d1d35f1acdeab18136ccc1e29 100644 (file)
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
                return;
 
        spin_lock_irq(&mapping->tree_lock);
-       /*
-        * Regular page slots are stabilized by the page lock even
-        * without the tree itself locked.  These unlocked entries
-        * need verification under the tree lock.
-        */
-       if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
-               goto unlock;
-       if (*slot != entry)
-               goto unlock;
-       radix_tree_replace_slot(slot, NULL);
-       mapping->nrshadows--;
-       if (!node)
-               goto unlock;
-       workingset_node_shadows_dec(node);
-       /*
-        * Don't track node without shadow entries.
-        *
-        * Avoid acquiring the list_lru lock if already untracked.
-        * The list_empty() test is safe as node->private_list is
-        * protected by mapping->tree_lock.
-        */
-       if (!workingset_node_shadows(node) &&
-           !list_empty(&node->private_list))
-               list_lru_del(&workingset_shadow_nodes, &node->private_list);
-       __radix_tree_delete_node(&mapping->page_tree, node);
+
+       if (dax_mapping(mapping)) {
+               if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+                       mapping->nrexceptional--;
+       } else {
+               /*
+                * Regular page slots are stabilized by the page lock even
+                * without the tree itself locked.  These unlocked entries
+                * need verification under the tree lock.
+                */
+               if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+                                       &slot))
+                       goto unlock;
+               if (*slot != entry)
+                       goto unlock;
+               radix_tree_replace_slot(slot, NULL);
+               mapping->nrexceptional--;
+               if (!node)
+                       goto unlock;
+               workingset_node_shadows_dec(node);
+               /*
+                * Don't track node without shadow entries.
+                *
+                * Avoid acquiring the list_lru lock if already untracked.
+                * The list_empty() test is safe as node->private_list is
+                * protected by mapping->tree_lock.
+                */
+               if (!workingset_node_shadows(node) &&
+                   !list_empty(&node->private_list))
+                       list_lru_del(&workingset_shadow_nodes,
+                                       &node->private_list);
+               __radix_tree_delete_node(&mapping->page_tree, node);
+       }
 unlock:
        spin_unlock_irq(&mapping->tree_lock);
 }
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
        int             i;
 
        cleancache_invalidate_inode(mapping);
-       if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                return;
 
        /* Offsets within partial pages */
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-       unsigned long nrshadows;
+       unsigned long nrexceptional;
        unsigned long nrpages;
 
        /*
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
 
        /*
         * When reclaim installs eviction entries, it increases
-        * nrshadows first, then decreases nrpages.  Make sure we see
+        * nrexceptional first, then decreases nrpages.  Make sure we see
         * this in the right order or we might miss an entry.
         */
        nrpages = mapping->nrpages;
        smp_rmb();
-       nrshadows = mapping->nrshadows;
+       nrexceptional = mapping->nrexceptional;
 
-       if (nrpages || nrshadows) {
+       if (nrpages || nrexceptional) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
index bd620b65db52680fc8a6e221fe90189c22dca2f1..eb3dd37ccd7c727dcc0b6030f62c183097b956bb 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/oom.h>
 #include <linux/prefetch.h>
 #include <linux/printk.h>
+#include <linux/dax.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 * inode reclaim needs to empty out the radix tree or
                 * the nodes are lost.  Don't plant shadows behind its
                 * back.
+                *
+                * We also don't store shadows for DAX mappings because the
+                * only page cache pages found in these are zero pages
+                * covering holes, and because we don't want to mix DAX
+                * exceptional entries and shadow exceptional entries in the
+                * same page_tree.
                 */
                if (reclaimed && page_is_file_cache(page) &&
-                   !mapping_exiting(mapping))
+                   !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(mapping, page);
                __delete_from_page_cache(page, shadow, memcg);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
index 64bd0aa13f75cc25247609542f0949c18995c8c1..40b2c74ddf16d69abc52574c469d9f22dd49cf03 100644 (file)
@@ -1408,17 +1408,7 @@ static void vmstat_update(struct work_struct *w)
                 * Defer the checking for differentials to the
                 * shepherd thread on a different processor.
                 */
-               int r;
-               /*
-                * Shepherd work thread does not race since it never
-                * changes the bit if its zero but the cpu
-                * online / off line code may race if
-                * worker threads are still allowed during
-                * shutdown / startup.
-                */
-               r = cpumask_test_and_set_cpu(smp_processor_id(),
-                       cpu_stat_off);
-               VM_BUG_ON(r);
+               cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
        }
 }
 
index aa017133744b227bed7592ea6cc32f360c3e142c..61ead9e5549df171f43fa53936a4b8cbd86a21a5 100644 (file)
@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
                        node->slots[i] = NULL;
                        BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
                        node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-                       BUG_ON(!mapping->nrshadows);
-                       mapping->nrshadows--;
+                       BUG_ON(!mapping->nrexceptional);
+                       mapping->nrexceptional--;
                }
        }
        BUG_ON(node->count);
index bced8c074c1280c6ff175576cb035bf048f18ddd..7bc2208b6cc4c445286c20e01d9911ac9e75b9b2 100644 (file)
@@ -108,9 +108,7 @@ struct p9_poll_wait {
  * @unsent_req_list: accounting for requests that haven't been sent
  * @req: current request being processed (if any)
  * @tmp_buf: temporary buffer to read in header
- * @rsize: amount to read for current frame
- * @rpos: read position in current frame
- * @rbuf: current read buffer
+ * @rc: temporary fcall for reading current frame
  * @wpos: write position for current frame
  * @wsize: amount of data to write for current frame
  * @wbuf: current write buffer
@@ -131,9 +129,7 @@ struct p9_conn {
        struct list_head unsent_req_list;
        struct p9_req_t *req;
        char tmp_buf[7];
-       int rsize;
-       int rpos;
-       char *rbuf;
+       struct p9_fcall rc;
        int wpos;
        int wsize;
        char *wbuf;
@@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work)
        if (m->err < 0)
                return;
 
-       p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos);
+       p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset);
 
-       if (!m->rbuf) {
-               m->rbuf = m->tmp_buf;
-               m->rpos = 0;
-               m->rsize = 7; /* start by reading header */
+       if (!m->rc.sdata) {
+               m->rc.sdata = m->tmp_buf;
+               m->rc.offset = 0;
+               m->rc.capacity = 7; /* start by reading header */
        }
 
        clear_bit(Rpending, &m->wsched);
-       p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n",
-                m, m->rpos, m->rsize, m->rsize-m->rpos);
-       err = p9_fd_read(m->client, m->rbuf + m->rpos,
-                                               m->rsize - m->rpos);
+       p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n",
+                m, m->rc.offset, m->rc.capacity,
+                m->rc.capacity - m->rc.offset);
+       err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset,
+                        m->rc.capacity - m->rc.offset);
        p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
-       if (err == -EAGAIN) {
+       if (err == -EAGAIN)
                goto end_clear;
-       }
 
        if (err <= 0)
                goto error;
 
-       m->rpos += err;
+       m->rc.offset += err;
 
-       if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */
-               u16 tag;
+       /* header read in */
+       if ((!m->req) && (m->rc.offset == m->rc.capacity)) {
                p9_debug(P9_DEBUG_TRANS, "got new header\n");
 
-               n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */
-               if (n >= m->client->msize) {
+               err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0);
+               if (err) {
+                       p9_debug(P9_DEBUG_ERROR,
+                                "error parsing header: %d\n", err);
+                       goto error;
+               }
+
+               if (m->rc.size >= m->client->msize) {
                        p9_debug(P9_DEBUG_ERROR,
-                                "requested packet size too big: %d\n", n);
+                                "requested packet size too big: %d\n",
+                                m->rc.size);
                        err = -EIO;
                        goto error;
                }
 
-               tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */
                p9_debug(P9_DEBUG_TRANS,
-                        "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag);
+                        "mux %p pkt: size: %d bytes tag: %d\n",
+                        m, m->rc.size, m->rc.tag);
 
-               m->req = p9_tag_lookup(m->client, tag);
+               m->req = p9_tag_lookup(m->client, m->rc.tag);
                if (!m->req || (m->req->status != REQ_STATUS_SENT)) {
                        p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
-                                tag);
+                                m->rc.tag);
                        err = -EIO;
                        goto error;
                }
 
                if (m->req->rc == NULL) {
-                       m->req->rc = kmalloc(sizeof(struct p9_fcall) +
-                                               m->client->msize, GFP_NOFS);
-                       if (!m->req->rc) {
-                               m->req = NULL;
-                               err = -ENOMEM;
-                               goto error;
-                       }
+                       p9_debug(P9_DEBUG_ERROR,
+                                "No recv fcall for tag %d (req %p), disconnecting!\n",
+                                m->rc.tag, m->req);
+                       m->req = NULL;
+                       err = -EIO;
+                       goto error;
                }
-               m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall);
-               memcpy(m->rbuf, m->tmp_buf, m->rsize);
-               m->rsize = n;
+               m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall);
+               memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity);
+               m->rc.capacity = m->rc.size;
        }
 
-       /* not an else because some packets (like clunk) have no payload */
-       if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */
+       /* packet is read in
+        * not an else because some packets (like clunk) have no payload
+        */
+       if ((m->req) && (m->rc.offset == m->rc.capacity)) {
                p9_debug(P9_DEBUG_TRANS, "got new packet\n");
                spin_lock(&m->client->lock);
                if (m->req->status != REQ_STATUS_ERROR)
@@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work)
                list_del(&m->req->req_list);
                spin_unlock(&m->client->lock);
                p9_client_cb(m->client, m->req, status);
-               m->rbuf = NULL;
-               m->rpos = 0;
-               m->rsize = 0;
+               m->rc.sdata = NULL;
+               m->rc.offset = 0;
+               m->rc.capacity = 0;
                m->req = NULL;
        }
 
index 199bc76202d2552d23fe964f377bbf69eaf518b9..4acb1d5417aaf980bc7797c817eb9a9a350ecbf5 100644 (file)
@@ -658,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
        mutex_unlock(&virtio_9p_lock);
 
        if (!found) {
-               pr_err("no channels available\n");
+               pr_err("no channels available for device %s\n", devname);
                return ret;
        }
 
index 10d87753ed8737329c244b1ae7718e95c8abd118..9e43a315e6622028ba2b61fd8d36ab20305d611b 100644 (file)
@@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac,
        void *ticket_buf = NULL;
        void *tp, *tpend;
        void **ptp;
-       struct ceph_timespec new_validity;
        struct ceph_crypto_key new_session_key;
        struct ceph_buffer *new_ticket_blob;
        unsigned long new_expires, new_renew_after;
@@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac,
        if (ret)
                goto out;
 
-       ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
-       ceph_decode_timespec(&validity, &new_validity);
+       ceph_decode_timespec(&validity, dp);
+       dp += sizeof(struct ceph_timespec);
        new_expires = get_seconds() + validity.tv_sec;
        new_renew_after = new_expires - (validity.tv_sec / 4);
        dout(" expires=%lu renew_after=%lu\n", new_expires,
@@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac,
                ceph_buffer_put(th->ticket_blob);
        th->session_key = new_session_key;
        th->ticket_blob = new_ticket_blob;
-       th->validity = new_validity;
        th->secret_id = new_secret_id;
        th->expires = new_expires;
        th->renew_after = new_renew_after;
+       th->have_key = true;
        dout(" got ticket service %d (%s) secret_id %lld len %d\n",
             type, ceph_entity_type_name(type), th->secret_id,
             (int)th->ticket_blob->vec.iov_len);
@@ -384,6 +383,24 @@ bad:
        return -ERANGE;
 }
 
+static bool need_key(struct ceph_x_ticket_handler *th)
+{
+       if (!th->have_key)
+               return true;
+
+       return get_seconds() >= th->renew_after;
+}
+
+static bool have_key(struct ceph_x_ticket_handler *th)
+{
+       if (th->have_key) {
+               if (get_seconds() >= th->expires)
+                       th->have_key = false;
+       }
+
+       return th->have_key;
+}
+
 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
 {
        int want = ac->want_keys;
@@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
                        continue;
 
                th = get_ticket_handler(ac, service);
-
                if (IS_ERR(th)) {
                        *pneed |= service;
                        continue;
                }
 
-               if (get_seconds() >= th->renew_after)
+               if (need_key(th))
                        *pneed |= service;
-               if (get_seconds() >= th->expires)
+               if (!have_key(th))
                        xi->have_keys &= ~service;
        }
 }
 
-
 static int ceph_x_build_request(struct ceph_auth_client *ac,
                                void *buf, void *end)
 {
@@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
        ac->private = NULL;
 }
 
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
-                                  int peer_type)
+static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
 {
        struct ceph_x_ticket_handler *th;
 
        th = get_ticket_handler(ac, peer_type);
        if (!IS_ERR(th))
-               memset(&th->validity, 0, sizeof(th->validity));
+               th->have_key = false;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+                                        int peer_type)
+{
+       /*
+        * We are invalidating a service ticket in the hope of getting
+        * a new, valid one.  But we won't get it unless our AUTH
+        * ticket is good, so invalidate the AUTH ticket as well, just
+        * in case.
+        */
+       invalidate_ticket(ac, peer_type);
+       invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
 }
 
 static int calcu_signature(struct ceph_x_authorizer *au,
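The net effect of the ticket changes above: the raw validity timespec on the handler is replaced by a have_key flag plus the existing expires/renew_after deadlines, with need_key()/have_key() encapsulating the two checks. A standalone model of that lifecycle, using time(2) in place of get_seconds() (function names follow the diff; the harness itself is an assumption):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ticket {                 /* stand-in for ceph_x_ticket_handler */
	bool   have_key;
	time_t expires;         /* hard deadline: key becomes unusable */
	time_t renew_after;     /* soft deadline: start asking for a new key */
};

/* Ask for a fresh key once renew_after passes (or if we never had one). */
static bool need_key(const struct ticket *t)
{
	return !t->have_key || time(NULL) >= t->renew_after;
}

/* The key stays usable until expires; crossing it clears have_key. */
static bool have_key(struct ticket *t)
{
	if (t->have_key && time(NULL) >= t->expires)
		t->have_key = false;
	return t->have_key;
}

int main(void)
{
	struct ticket t = { true, time(NULL) + 60, time(NULL) + 30 };

	printf("need=%d have=%d\n", need_key(&t), have_key(&t));
	return 0;
}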
index e8b7c6917d472f69d356252415efcb76a4afd3cf..40b1a3cf7397352e4673becccdbbb083c88f04d8 100644 (file)
@@ -16,7 +16,7 @@ struct ceph_x_ticket_handler {
        unsigned int service;
 
        struct ceph_crypto_key session_key;
-       struct ceph_timespec validity;
+       bool have_key;
 
        u64 secret_id;
        struct ceph_buffer *ticket_blob;
index 9981039ef4ffcca29081b83a5e06a81ce6871f20..9cfedf565f5b236b5ede7974466cf62af6ad995c 100644 (file)
@@ -23,9 +23,6 @@
 #include <linux/ceph/pagelist.h>
 #include <linux/export.h>
 
-#define list_entry_next(pos, member)                                   \
-       list_entry(pos->member.next, typeof(*pos), member)
-
 /*
  * Ceph uses the messenger to exchange ceph_msg messages with other
  * hosts in the system.  The messenger provides ordered and reliable
@@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con)
        }
        con->in_seq = 0;
        con->in_seq_acked = 0;
+
+       con->out_skip = 0;
 }
 
 /*
@@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
 
 static void con_out_kvec_reset(struct ceph_connection *con)
 {
+       BUG_ON(con->out_skip);
+
        con->out_kvec_left = 0;
        con->out_kvec_bytes = 0;
        con->out_kvec_cur = &con->out_kvec[0];
@@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con)
 static void con_out_kvec_add(struct ceph_connection *con,
                                size_t size, void *data)
 {
-       int index;
+       int index = con->out_kvec_left;
 
-       index = con->out_kvec_left;
+       BUG_ON(con->out_skip);
        BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
 
        con->out_kvec[index].iov_len = size;
@@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con,
        con->out_kvec_bytes += size;
 }
 
+/*
+ * Chop off a kvec from the end.  Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+       int off = con->out_kvec_cur - con->out_kvec;
+       int skip = 0;
+
+       if (con->out_kvec_bytes > 0) {
+               skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
+               BUG_ON(con->out_kvec_bytes < skip);
+               BUG_ON(!con->out_kvec_left);
+               con->out_kvec_bytes -= skip;
+               con->out_kvec_left--;
+       }
+
+       return skip;
+}
+
 #ifdef CONFIG_BLOCK
 
 /*
@@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
        /* Move on to the next page */
 
        BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
-       cursor->page = list_entry_next(cursor->page, lru);
+       cursor->page = list_next_entry(cursor->page, lru);
        cursor->last_piece = cursor->resid <= PAGE_SIZE;
 
        return true;
@@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
        if (!cursor->resid && cursor->total_resid) {
                WARN_ON(!cursor->last_piece);
                BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
-               cursor->data = list_entry_next(cursor->data, links);
+               cursor->data = list_next_entry(cursor->data, links);
                __ceph_msg_data_cursor_init(cursor);
                new_piece = true;
        }
@@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con)
        m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
 
        dout("prepare_write_message_footer %p\n", con);
-       con->out_kvec_is_msg = true;
        con->out_kvec[v].iov_base = &m->footer;
        if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
                if (con->ops->sign_message)
@@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con)
        u32 crc;
 
        con_out_kvec_reset(con);
-       con->out_kvec_is_msg = true;
        con->out_msg_done = false;
 
        /* Sneak an ack in there first?  If we can get it into the same
@@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con)
 
        /* tag + hdr + front + middle */
        con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
-       con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+       con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
        con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
 
        if (m->middle)
                con_out_kvec_add(con, m->middle->vec.iov_len,
                        m->middle->vec.iov_base);
 
-       /* fill in crc (except data pages), footer */
+       /* fill in hdr crc and finalize hdr */
        crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
        con->out_msg->hdr.crc = cpu_to_le32(crc);
-       con->out_msg->footer.flags = 0;
+       memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
 
+       /* fill in front and middle crc, footer */
        crc = crc32c(0, m->front.iov_base, m->front.iov_len);
        con->out_msg->footer.front_crc = cpu_to_le32(crc);
        if (m->middle) {
@@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con)
        dout("%s front_crc %u middle_crc %u\n", __func__,
             le32_to_cpu(con->out_msg->footer.front_crc),
             le32_to_cpu(con->out_msg->footer.middle_crc));
+       con->out_msg->footer.flags = 0;
 
        /* is there a data payload? */
        con->out_msg->footer.data_crc = 0;
@@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con)
                }
        }
        con->out_kvec_left = 0;
-       con->out_kvec_is_msg = false;
        ret = 1;
 out:
        dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
@@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con)
 {
        int ret;
 
+       dout("%s %p %d left\n", __func__, con, con->out_skip);
        while (con->out_skip > 0) {
                size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
 
@@ -2506,13 +2528,13 @@ more:
 
 more_kvec:
        /* kvec data queued? */
-       if (con->out_skip) {
-               ret = write_partial_skip(con);
+       if (con->out_kvec_left) {
+               ret = write_partial_kvec(con);
                if (ret <= 0)
                        goto out;
        }
-       if (con->out_kvec_left) {
-               ret = write_partial_kvec(con);
+       if (con->out_skip) {
+               ret = write_partial_skip(con);
                if (ret <= 0)
                        goto out;
        }
@@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con)
 
 static void con_fault_finish(struct ceph_connection *con)
 {
+       dout("%s %p\n", __func__, con);
+
        /*
         * in case we faulted due to authentication, invalidate our
         * current tickets so that we can get new ones.
         */
-       if (con->auth_retry && con->ops->invalidate_authorizer) {
-               dout("calling invalidate_authorizer()\n");
-               con->ops->invalidate_authorizer(con);
+       if (con->auth_retry) {
+               dout("auth_retry %d, invalidating\n", con->auth_retry);
+               if (con->ops->invalidate_authorizer)
+                       con->ops->invalidate_authorizer(con);
+               con->auth_retry = 0;
        }
 
        if (con->ops->fault)
@@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg)
                ceph_msg_put(msg);
        }
        if (con->out_msg == msg) {
-               dout("%s %p msg %p - was sending\n", __func__, con, msg);
-               con->out_msg = NULL;
-               if (con->out_kvec_is_msg) {
-                       con->out_skip = con->out_kvec_bytes;
-                       con->out_kvec_is_msg = false;
+               BUG_ON(con->out_skip);
+               /* footer */
+               if (con->out_msg_done) {
+                       con->out_skip += con_out_kvec_skip(con);
+               } else {
+                       BUG_ON(!msg->data_length);
+                       if (con->peer_features & CEPH_FEATURE_MSG_AUTH)
+                               con->out_skip += sizeof(msg->footer);
+                       else
+                               con->out_skip += sizeof(msg->old_footer);
                }
+               /* data, middle, front */
+               if (msg->data_length)
+                       con->out_skip += msg->cursor.total_resid;
+               if (msg->middle)
+                       con->out_skip += con_out_kvec_skip(con);
+               con->out_skip += con_out_kvec_skip(con);
+
+               dout("%s %p msg %p - was sending, will write %d skip %d\n",
+                    __func__, con, msg, con->out_kvec_bytes, con->out_skip);
                msg->hdr.seq = 0;
-
+               con->out_msg = NULL;
                ceph_msg_put(msg);
        }
+
        mutex_unlock(&con->mutex);
 }
 
@@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m)
 static void ceph_msg_release(struct kref *kref)
 {
        struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
-       LIST_HEAD(data);
-       struct list_head *links;
-       struct list_head *next;
+       struct ceph_msg_data *data, *next;
 
        dout("%s %p\n", __func__, m);
        WARN_ON(!list_empty(&m->list_head));
@@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref)
                m->middle = NULL;
        }
 
-       list_splice_init(&m->data, &data);
-       list_for_each_safe(links, next, &data) {
-               struct ceph_msg_data *data;
-
-               data = list_entry(links, struct ceph_msg_data, links);
-               list_del_init(links);
+       list_for_each_entry_safe(data, next, &m->data, links) {
+               list_del_init(&data->links);
                ceph_msg_data_destroy(data);
        }
        m->data_length = 0;
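The revoke rework above drops the out_kvec_is_msg flag in favor of explicit byte accounting: each queued piece of a partially sent message (front, middle, footer, remaining data) becomes an out_skip count that write_partial_skip() later consumes by emitting zeros. A minimal userspace model of that accounting (the helper mirrors con_out_kvec_skip(); the surrounding types are simplified assumptions):

#include <assert.h>
#include <stdio.h>

struct conn {                   /* simplified stand-in for ceph_connection */
	int kvec_lens[8];       /* lengths of queued kvecs */
	int kvec_left;          /* kvecs not yet written */
	int kvec_bytes;         /* bytes not yet written */
	int out_skip;           /* bytes to emit as zeros instead */
};

/* Drop the last queued kvec; return how many bytes it would have sent. */
static int kvec_skip(struct conn *c)
{
	int skip = 0;

	if (c->kvec_bytes > 0) {
		assert(c->kvec_left > 0);
		skip = c->kvec_lens[c->kvec_left - 1];
		c->kvec_bytes -= skip;
		c->kvec_left--;
	}
	return skip;
}

int main(void)
{
	/* front (16 bytes) and middle (32 bytes) queued, then revoked */
	struct conn c = { { 16, 32 }, 2, 48, 0 };

	c.out_skip += kvec_skip(&c);    /* middle */
	c.out_skip += kvec_skip(&c);    /* front  */
	printf("skip %d bytes instead of sending them\n", c.out_skip);
	return 0;
}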
index edda01626a459efbfdaeebd46fd6a0b13a037879..de85dddc3dc08cfeee81435c4475a6d975c4cd96 100644 (file)
@@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc)
        return monc->client->have_fsid && monc->auth->global_id > 0;
 }
 
-/*
- * The monitor responds with mount ack indicate mount success.  The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
                                 struct ceph_msg *msg)
 {
index 744e5936c10d7ec555d1ca621f5bd4be57f1c72b..7aea0ccb6be6e463393c737103bf1ad467ee1d39 100644 (file)
@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
 
        if (!n->tn_bits)
                kmem_cache_free(trie_leaf_kmem, n);
-       else if (n->tn_bits <= TNODE_KMALLOC_MAX)
-               kfree(n);
        else
-               vfree(n);
+               kvfree(n);
 }
 
 #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
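The fib_trie hunk above is a straight simplification: kvfree() inspects the pointer (via is_vmalloc_addr()) and dispatches to kfree() or vfree() itself, so the caller no longer needs to remember which allocator produced the node, and the TNODE_KMALLOC_MAX branch disappears. A kernel-style sketch of the before/after shape (kvfree() is the real API; the helper names here are illustrative):

#include <linux/mm.h>       /* kvfree() */
#include <linux/slab.h>     /* kfree() */
#include <linux/types.h>    /* bool */
#include <linux/vmalloc.h>  /* vfree() */

/* Before: the caller tracked which allocator was used. */
static void free_node_old(void *n, bool was_vmalloc)
{
	if (was_vmalloc)
		vfree(n);
	else
		kfree(n);
}

/* After: kvfree() dispatches on the address itself. */
static void free_node_new(void *n)
{
	kvfree(n);
}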
index f222885ac0c7397ed08c26106e68157d91267ff6..9481d55ff6cb2dd7a228964fa3d9dea154a5a9af 100644 (file)
@@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
 static void rds_ib_add_one(struct ib_device *device)
 {
        struct rds_ib_device *rds_ibdev;
-       struct ib_device_attr *dev_attr;
 
        /* Only handle IB (no iWARP) devices */
        if (device->node_type != RDMA_NODE_IB_CA)
                return;
 
-       dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
-       if (!dev_attr)
-               return;
-
-       if (ib_query_device(device, dev_attr)) {
-               rdsdebug("Query device failed for %s\n", device->name);
-               goto free_attr;
-       }
-
        rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
                                 ibdev_to_node(device));
        if (!rds_ibdev)
-               goto free_attr;
+               return;
 
        spin_lock_init(&rds_ibdev->spinlock);
        atomic_set(&rds_ibdev->refcount, 1);
        INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
 
-       rds_ibdev->max_wrs = dev_attr->max_qp_wr;
-       rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+       rds_ibdev->max_wrs = device->attrs.max_qp_wr;
+       rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
 
-       rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
-       rds_ibdev->max_1m_fmrs = dev_attr->max_mr ?
-               min_t(unsigned int, (dev_attr->max_mr / 2),
+       rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
+       rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
+               min_t(unsigned int, (device->attrs.max_mr / 2),
                      rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
 
-       rds_ibdev->max_8k_fmrs = dev_attr->max_mr ?
-               min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE),
+       rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
+               min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
                      rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
 
-       rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
-       rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+       rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
+       rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
 
        rds_ibdev->dev = device;
        rds_ibdev->pd = ib_alloc_pd(device);
@@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device)
        }
 
        rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
-                dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+                device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
                 rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
                 rds_ibdev->max_8k_fmrs);
 
@@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device)
 
 put_dev:
        rds_ib_dev_put(rds_ibdev);
-free_attr:
-       kfree(dev_attr);
 }
 
 /*
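Both RDS conversions above ride on a core IB change in this cycle: the core now queries device attributes once at registration and caches them in ib_device->attrs, so per-client ib_query_device() calls and their temporary ib_device_attr allocations can go away. A kernel-style sketch of the shape of the change (assuming the 4.5-era ib_device layout; the reporting helpers are illustrative):

#include <linux/printk.h>
#include <rdma/ib_verbs.h>

/* Before: allocate or stack a struct, query, read, discard. */
static void report_old(struct ib_device *device)
{
	struct ib_device_attr attr;

	if (ib_query_device(device, &attr))
		return;
	pr_info("max_qp_wr = %d\n", attr.max_qp_wr);
}

/* After: the core filled device->attrs at registration time. */
static void report_new(struct ib_device *device)
{
	pr_info("max_qp_wr = %d\n", device->attrs.max_qp_wr);
}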
index 576f1825fc55769f7242c75c771a50a6e8e5e93b..f4a9fff829e0e5193697ad334065dd423ab4fe0f 100644 (file)
@@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns);
 static void rds_iw_add_one(struct ib_device *device)
 {
        struct rds_iw_device *rds_iwdev;
-       struct ib_device_attr *dev_attr;
 
        /* Only handle iwarp devices */
        if (device->node_type != RDMA_NODE_RNIC)
                return;
 
-       dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
-       if (!dev_attr)
-               return;
-
-       if (ib_query_device(device, dev_attr)) {
-               rdsdebug("Query device failed for %s\n", device->name);
-               goto free_attr;
-       }
-
        rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
        if (!rds_iwdev)
-               goto free_attr;
+               return;
 
        spin_lock_init(&rds_iwdev->spinlock);
 
-       rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
-       rds_iwdev->max_wrs = dev_attr->max_qp_wr;
-       rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+       rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+       rds_iwdev->max_wrs = device->attrs.max_qp_wr;
+       rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE);
 
        rds_iwdev->dev = device;
        rds_iwdev->pd = ib_alloc_pd(device);
@@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device)
        list_add_tail(&rds_iwdev->list, &rds_iw_devices);
 
        ib_set_client_data(device, &rds_iw_client, rds_iwdev);
-
-       goto free_attr;
+       return;
 
 err_mr:
        if (rds_iwdev->mr)
@@ -121,8 +110,6 @@ err_pd:
        ib_dealloc_pd(rds_iwdev->pd);
 free_dev:
        kfree(rds_iwdev);
-free_attr:
-       kfree(dev_attr);
 }
 
 static void rds_iw_remove_one(struct ib_device *device, void *client_data)
index 5e4f815c2b34d22fba11ac2443e9d66bc27c779c..2b32fd602669f1fbc1aa96d61ad076fe688713e1 100644 (file)
@@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
        if (count == 0)
                return 0;
 
-       mutex_lock(&inode->i_mutex); /* protect against multiple concurrent
+       inode_lock(inode); /* protect against multiple concurrent
                              * readers on this file */
  again:
        spin_lock(&queue_lock);
@@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
        }
        if (rp->q.list.next == &cd->queue) {
                spin_unlock(&queue_lock);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                WARN_ON_ONCE(rp->offset);
                return 0;
        }
@@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
        }
        if (err == -EAGAIN)
                goto again;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return err ? err :  count;
 }
 
@@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf,
        if (!cd->cache_parse)
                goto out;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        ret = cache_downcall(mapping, buf, count, cd);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 out:
        return ret;
 }
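The mechanical conversions in this file (and in rpc_pipe below) use the new inode_lock() family, introduced this cycle as thin wrappers over the inode mutex; routing all accesses through the wrappers made the later switch away from i_mutex a one-place change. Roughly, as introduced (reconstructed from memory, so treat as a sketch):

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}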
index 14f45bf0410c688b25fafedfb6636cfa025ad06c..31789ef3e614484a4d5c75721d4b9f63fc735f6a 100644 (file)
@@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode)
        int need_release;
        LIST_HEAD(free_list);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        spin_lock(&pipe->lock);
        need_release = pipe->nreaders != 0 || pipe->nwriters != 0;
        pipe->nreaders = 0;
@@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode)
        cancel_delayed_work_sync(&pipe->queue_timeout);
        rpc_inode_setowner(inode, NULL);
        RPC_I(inode)->pipe = NULL;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 }
 
 static struct inode *
@@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
        int first_open;
        int res = -ENXIO;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        pipe = RPC_I(inode)->pipe;
        if (pipe == NULL)
                goto out;
@@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
                pipe->nwriters++;
        res = 0;
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return res;
 }
 
@@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
        struct rpc_pipe_msg *msg;
        int last_close;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        pipe = RPC_I(inode)->pipe;
        if (pipe == NULL)
                goto out;
@@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
        if (last_close && pipe->ops->release_pipe)
                pipe->ops->release_pipe(inode);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return 0;
 }
 
@@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
        struct rpc_pipe_msg *msg;
        int res = 0;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        pipe = RPC_I(inode)->pipe;
        if (pipe == NULL) {
                res = -EPIPE;
@@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
                pipe->ops->destroy_msg(msg);
        }
 out_unlock:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return res;
 }
 
@@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
        struct inode *inode = file_inode(filp);
        int res;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        res = -EPIPE;
        if (RPC_I(inode)->pipe != NULL)
                res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len);
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return res;
 }
 
@@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
 
        poll_wait(filp, &rpci->waitq, wait);
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        if (rpci->pipe == NULL)
                mask |= POLLERR | POLLHUP;
        else if (filp->private_data || !list_empty(&rpci->pipe->pipe))
                mask |= POLLIN | POLLRDNORM;
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        return mask;
 }
 
@@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
        switch (cmd) {
        case FIONREAD:
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                pipe = RPC_I(inode)->pipe;
                if (pipe == NULL) {
-                       mutex_unlock(&inode->i_mutex);
+                       inode_unlock(inode);
                        return -EPIPE;
                }
                spin_lock(&pipe->lock);
@@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        len += msg->len - msg->copied;
                }
                spin_unlock(&pipe->lock);
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
                return put_user(len, (int __user *)arg);
        default:
                return -EINVAL;
@@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry)
 
        parent = dget_parent(dentry);
        dir = d_inode(parent);
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(dir, I_MUTEX_PARENT);
        error = __rpc_rmdir(dir, dentry);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        dput(parent);
        return error;
 }
@@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent,
 {
        struct inode *dir = d_inode(parent);
 
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
+       inode_lock_nested(dir, I_MUTEX_CHILD);
        __rpc_depopulate(parent, files, start, eof);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
 }
 
 static int rpc_populate(struct dentry *parent,
@@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent,
        struct dentry *dentry;
        int i, err;
 
-       mutex_lock(&dir->i_mutex);
+       inode_lock(dir);
        for (i = start; i < eof; i++) {
                dentry = __rpc_lookup_create_exclusive(parent, files[i].name);
                err = PTR_ERR(dentry);
@@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent,
                if (err != 0)
                        goto out_bad;
        }
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        return 0;
 out_bad:
        __rpc_depopulate(parent, files, start, eof);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        printk(KERN_WARNING "%s: %s failed to populate directory %pd\n",
                        __FILE__, __func__, parent);
        return err;
@@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
        struct inode *dir = d_inode(parent);
        int error;
 
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(dir, I_MUTEX_PARENT);
        dentry = __rpc_lookup_create_exclusive(parent, name);
        if (IS_ERR(dentry))
                goto out;
@@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
                        goto err_rmdir;
        }
 out:
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        return dentry;
 err_rmdir:
        __rpc_rmdir(dir, dentry);
@@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
 
        parent = dget_parent(dentry);
        dir = d_inode(parent);
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(dir, I_MUTEX_PARENT);
        if (depopulate != NULL)
                depopulate(dentry);
        error = __rpc_rmdir(dir, dentry);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        dput(parent);
        return error;
 }
@@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
        if (pipe->ops->downcall == NULL)
                umode &= ~S_IWUGO;
 
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(dir, I_MUTEX_PARENT);
        dentry = __rpc_lookup_create_exclusive(parent, name);
        if (IS_ERR(dentry))
                goto out;
@@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
        if (err)
                goto out_err;
 out:
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        return dentry;
 out_err:
        dentry = ERR_PTR(err);
@@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry)
 
        parent = dget_parent(dentry);
        dir = d_inode(parent);
-       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       inode_lock_nested(dir, I_MUTEX_PARENT);
        error = __rpc_rmpipe(dir, dentry);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        dput(parent);
        return error;
 }
index 2e98f4a243e57d4539b3f4048d54c4b53f3c12d5..37edea6fa92d9685570ad3c1b37cb5fda5c0b9b3 100644 (file)
@@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt)
        if (atomic_dec_and_test(&xprt->count))
                xprt_destroy(xprt);
 }
+EXPORT_SYMBOL_GPL(xprt_put);
index 33f99d3004f2718206ad6b245a214e20718ac540..dc9f3b513a05f3a2fc9027359f9e1e587f9d243b 100644 (file)
@@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
 
 rpcrdma-y := transport.o rpc_rdma.o verbs.o \
        fmr_ops.o frwr_ops.o physical_ops.o \
-       svc_rdma.o svc_rdma_transport.o \
+       svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
        svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
        module.o
 rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
index c6836844bd0eb65ffa7134a6a014d60b5b42af1f..e16567389e28f5eba1359ddf91802d63c4612c26 100644 (file)
@@ -190,12 +190,11 @@ static int
 frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
             struct rpcrdma_create_data_internal *cdata)
 {
-       struct ib_device_attr *devattr = &ia->ri_devattr;
        int depth, delta;
 
        ia->ri_max_frmr_depth =
                        min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                             devattr->max_fast_reg_page_list_len);
+                             ia->ri_device->attrs.max_fast_reg_page_list_len);
        dprintk("RPC:       %s: device's max FR page list len = %u\n",
                __func__, ia->ri_max_frmr_depth);
 
@@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
        }
 
        ep->rep_attr.cap.max_send_wr *= depth;
-       if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
-               cdata->max_requests = devattr->max_qp_wr / depth;
+       if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
+               cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
                if (!cdata->max_requests)
                        return -EINVAL;
                ep->rep_attr.cap.max_send_wr = cdata->max_requests *
index 1b7051bdbdc863dcd149b397bbc3f84a8872c5b1..c846ca9f1ebaa661f1e4a93893752950d35a4b6a 100644 (file)
@@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD;
 static unsigned int min_ord = 1;
 static unsigned int max_ord = 4096;
 unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
 static unsigned int min_max_requests = 4;
 static unsigned int max_max_requests = 16384;
 unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
@@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod;
 atomic_t rdma_stat_sq_poll;
 atomic_t rdma_stat_sq_prod;
 
-/* Temporary NFS request map and context caches */
-struct kmem_cache *svc_rdma_map_cachep;
-struct kmem_cache *svc_rdma_ctxt_cachep;
-
 struct workqueue_struct *svc_rdma_wq;
 
 /*
@@ -243,17 +240,16 @@ void svc_rdma_cleanup(void)
        svc_unreg_xprt_class(&svc_rdma_bc_class);
 #endif
        svc_unreg_xprt_class(&svc_rdma_class);
-       kmem_cache_destroy(svc_rdma_map_cachep);
-       kmem_cache_destroy(svc_rdma_ctxt_cachep);
 }
 
 int svc_rdma_init(void)
 {
        dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
        dprintk("\tsvcrdma_ord      : %d\n", svcrdma_ord);
-       dprintk("\tmax_requests     : %d\n", svcrdma_max_requests);
-       dprintk("\tsq_depth         : %d\n",
+       dprintk("\tmax_requests     : %u\n", svcrdma_max_requests);
+       dprintk("\tsq_depth         : %u\n",
                svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+       dprintk("\tmax_bc_requests  : %u\n", svcrdma_max_bc_requests);
        dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
 
        svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
@@ -264,39 +260,10 @@ int svc_rdma_init(void)
                svcrdma_table_header =
                        register_sysctl_table(svcrdma_root_table);
 
-       /* Create the temporary map cache */
-       svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
-                                               sizeof(struct svc_rdma_req_map),
-                                               0,
-                                               SLAB_HWCACHE_ALIGN,
-                                               NULL);
-       if (!svc_rdma_map_cachep) {
-               printk(KERN_INFO "Could not allocate map cache.\n");
-               goto err0;
-       }
-
-       /* Create the temporary context cache */
-       svc_rdma_ctxt_cachep =
-               kmem_cache_create("svc_rdma_ctxt_cache",
-                                 sizeof(struct svc_rdma_op_ctxt),
-                                 0,
-                                 SLAB_HWCACHE_ALIGN,
-                                 NULL);
-       if (!svc_rdma_ctxt_cachep) {
-               printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
-               goto err1;
-       }
-
        /* Register RDMA with the SVC transport switch */
        svc_reg_xprt_class(&svc_rdma_class);
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        svc_reg_xprt_class(&svc_rdma_bc_class);
 #endif
        return 0;
- err1:
-       kmem_cache_destroy(svc_rdma_map_cachep);
- err0:
-       unregister_sysctl_table(svcrdma_table_header);
-       destroy_workqueue(svc_rdma_wq);
-       return -ENOMEM;
 }
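The two kmem caches deleted above were global and shared by every transport instance; their replacement, visible in the svc_rdma_transport.c hunks further below, is a per-connection free list that is pre-allocated at accept time and falls back to GFP_NOIO allocation only when the pool runs dry. A small userspace model of that pool pattern (simplified assumptions: one lock, a single object type):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

struct pool {
	pthread_mutex_t lock;
	struct node *free;      /* pre-allocated objects */
};

/* Take from the free list; fall back to the heap if it is empty
 * (the kernel side uses GFP_NOIO there to stay safe under memory
 * pressure in the I/O path).
 */
static void *pool_get(struct pool *p)
{
	struct node *n;

	pthread_mutex_lock(&p->lock);
	n = p->free;
	if (n)
		p->free = n->next;
	pthread_mutex_unlock(&p->lock);

	return n ? n : malloc(sizeof(*n));
}

static void pool_put(struct pool *p, void *obj)
{
	struct node *n = obj;

	pthread_mutex_lock(&p->lock);
	n->next = p->free;
	p->free = n;
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct pool p = { PTHREAD_MUTEX_INITIALIZER, NULL };
	void *a = pool_get(&p);

	pool_put(&p, a);
	printf("reused: %d\n", pool_get(&p) == a);
	return 0;
}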
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
new file mode 100644 (file)
index 0000000..65a7c23
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA (server-side).
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY        RPCDBG_SVCXPRT
+
+#undef SVCRDMA_BACKCHANNEL_DEBUG
+
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+                            struct xdr_buf *rcvbuf)
+{
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct kvec *dst, *src = &rcvbuf->head[0];
+       struct rpc_rqst *req;
+       unsigned long cwnd;
+       u32 credits;
+       size_t len;
+       __be32 xid;
+       __be32 *p;
+       int ret;
+
+       p = (__be32 *)src->iov_base;
+       len = src->iov_len;
+       xid = rmsgp->rm_xid;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+       pr_info("%s: xid=%08x, length=%zu\n",
+               __func__, be32_to_cpu(xid), len);
+       pr_info("%s: RPC/RDMA: %*ph\n",
+               __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+       pr_info("%s:      RPC: %*ph\n",
+               __func__, (int)len, p);
+#endif
+
+       ret = -EAGAIN;
+       if (src->iov_len < 24)
+               goto out_shortreply;
+
+       spin_lock_bh(&xprt->transport_lock);
+       req = xprt_lookup_rqst(xprt, xid);
+       if (!req)
+               goto out_notfound;
+
+       dst = &req->rq_private_buf.head[0];
+       memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+       if (dst->iov_len < len)
+               goto out_unlock;
+       memcpy(dst->iov_base, p, len);
+
+       credits = be32_to_cpu(rmsgp->rm_credit);
+       if (credits == 0)
+               credits = 1;    /* don't deadlock */
+       else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
+               credits = r_xprt->rx_buf.rb_bc_max_requests;
+
+       cwnd = xprt->cwnd;
+       xprt->cwnd = credits << RPC_CWNDSHIFT;
+       if (xprt->cwnd > cwnd)
+               xprt_release_rqst_cong(req->rq_task);
+
+       ret = 0;
+       xprt_complete_rqst(req->rq_task, rcvbuf->len);
+       rcvbuf->len = 0;
+
+out_unlock:
+       spin_unlock_bh(&xprt->transport_lock);
+out:
+       return ret;
+
+out_shortreply:
+       dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n",
+               xprt, src->iov_len);
+       goto out;
+
+out_notfound:
+       dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n",
+               xprt, be32_to_cpu(xid));
+
+       goto out_unlock;
+}
+
+/* Send a backwards direction RPC call.
+ *
+ * Caller holds the connection's mutex and has already marshaled
+ * the RPC/RDMA request.
+ *
+ * This is similar to svc_rdma_reply, but takes an rpc_rqst
+ * instead, does not support chunks, and avoids blocking memory
+ * allocation.
+ *
+ * XXX: There is still an opportunity to block in svc_rdma_send()
+ * if there are no SQ entries to post the Send. This may occur if
+ * the adapter has a small maximum SQ depth.
+ */
+static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
+                             struct rpc_rqst *rqst)
+{
+       struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
+       struct svc_rdma_op_ctxt *ctxt;
+       struct svc_rdma_req_map *vec;
+       struct ib_send_wr send_wr;
+       int ret;
+
+       vec = svc_rdma_get_req_map(rdma);
+       ret = svc_rdma_map_xdr(rdma, sndbuf, vec);
+       if (ret)
+               goto out_err;
+
+       /* Post a recv buffer to handle the reply for this request. */
+       ret = svc_rdma_post_recv(rdma, GFP_NOIO);
+       if (ret) {
+               pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n",
+                      ret);
+               pr_err("svcrdma: closing transport %p.\n", rdma);
+               set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+               ret = -ENOTCONN;
+               goto out_err;
+       }
+
+       ctxt = svc_rdma_get_context(rdma);
+       ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
+       ctxt->count = 1;
+
+       ctxt->wr_op = IB_WR_SEND;
+       ctxt->direction = DMA_TO_DEVICE;
+       ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
+       ctxt->sge[0].length = sndbuf->len;
+       ctxt->sge[0].addr =
+           ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
+                           sndbuf->len, DMA_TO_DEVICE);
+       if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
+               ret = -EIO;
+               goto out_unmap;
+       }
+       atomic_inc(&rdma->sc_dma_used);
+
+       memset(&send_wr, 0, sizeof(send_wr));
+       send_wr.wr_id = (unsigned long)ctxt;
+       send_wr.sg_list = ctxt->sge;
+       send_wr.num_sge = 1;
+       send_wr.opcode = IB_WR_SEND;
+       send_wr.send_flags = IB_SEND_SIGNALED;
+
+       ret = svc_rdma_send(rdma, &send_wr);
+       if (ret) {
+               ret = -EIO;
+               goto out_unmap;
+       }
+
+out_err:
+       svc_rdma_put_req_map(rdma, vec);
+       dprintk("svcrdma: %s returns %d\n", __func__, ret);
+       return ret;
+
+out_unmap:
+       svc_rdma_unmap_dma(ctxt);
+       svc_rdma_put_context(ctxt, 1);
+       goto out_err;
+}
+
+/* Server-side transport endpoint wants a whole page for its send
+ * buffer. The client RPC code constructs the RPC header in this
+ * buffer before it invokes ->send_request.
+ *
+ * Returns NULL if there was a temporary allocation failure.
+ */
+static void *
+xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+{
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+       struct svcxprt_rdma *rdma;
+       struct page *page;
+
+       rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+
+       /* Prevent an infinite loop: try to make this case work */
+       if (size > PAGE_SIZE)
+               WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
+                         size);
+
+       page = alloc_page(RPCRDMA_DEF_GFP);
+       if (!page)
+               return NULL;
+
+       return page_address(page);
+}
+
+static void
+xprt_rdma_bc_free(void *buffer)
+{
+       /* No-op: ctxt and page have already been freed. */
+}
+
+static int
+rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
+{
+       struct rpc_xprt *xprt = rqst->rq_xprt;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+       int rc;
+
+       /* Space in the send buffer for an RPC/RDMA header is reserved
+        * via xprt->tsh_size.
+        */
+       headerp->rm_xid = rqst->rq_xid;
+       headerp->rm_vers = rpcrdma_version;
+       headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+       headerp->rm_type = rdma_msg;
+       headerp->rm_body.rm_chunks[0] = xdr_zero;
+       headerp->rm_body.rm_chunks[1] = xdr_zero;
+       headerp->rm_body.rm_chunks[2] = xdr_zero;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+       pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
+#endif
+
+       rc = svc_rdma_bc_sendto(rdma, rqst);
+       if (rc)
+               goto drop_connection;
+       return rc;
+
+drop_connection:
+       dprintk("svcrdma: failed to send bc call\n");
+       xprt_disconnect_done(xprt);
+       return -ENOTCONN;
+}
+
+/* Send an RPC call on the passive end of a transport
+ * connection.
+ */
+static int
+xprt_rdma_bc_send_request(struct rpc_task *task)
+{
+       struct rpc_rqst *rqst = task->tk_rqstp;
+       struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+       struct svcxprt_rdma *rdma;
+       int ret;
+
+       dprintk("svcrdma: sending bc call with xid: %08x\n",
+               be32_to_cpu(rqst->rq_xid));
+
+       if (!mutex_trylock(&sxprt->xpt_mutex)) {
+               rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
+               if (!mutex_trylock(&sxprt->xpt_mutex))
+                       return -EAGAIN;
+               rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
+       }
+
+       ret = -ENOTCONN;
+       rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+       if (!test_bit(XPT_DEAD, &sxprt->xpt_flags))
+               ret = rpcrdma_bc_send_request(rdma, rqst);
+
+       mutex_unlock(&sxprt->xpt_mutex);
+
+       if (ret < 0)
+               return ret;
+       return 0;
+}
+
+static void
+xprt_rdma_bc_close(struct rpc_xprt *xprt)
+{
+       dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+}
+
+static void
+xprt_rdma_bc_put(struct rpc_xprt *xprt)
+{
+       dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+
+       xprt_free(xprt);
+       module_put(THIS_MODULE);
+}
+
+static struct rpc_xprt_ops xprt_rdma_bc_procs = {
+       .reserve_xprt           = xprt_reserve_xprt_cong,
+       .release_xprt           = xprt_release_xprt_cong,
+       .alloc_slot             = xprt_alloc_slot,
+       .release_request        = xprt_release_rqst_cong,
+       .buf_alloc              = xprt_rdma_bc_allocate,
+       .buf_free               = xprt_rdma_bc_free,
+       .send_request           = xprt_rdma_bc_send_request,
+       .set_retrans_timeout    = xprt_set_retrans_timeout_def,
+       .close                  = xprt_rdma_bc_close,
+       .destroy                = xprt_rdma_bc_put,
+       .print_stats            = xprt_rdma_print_stats
+};
+
+static const struct rpc_timeout xprt_rdma_bc_timeout = {
+       .to_initval = 60 * HZ,
+       .to_maxval = 60 * HZ,
+};
+
+/* It shouldn't matter if the number of backchannel session slots
+ * doesn't match the number of RPC/RDMA credits. That just means
+ * one or the other will have extra slots that aren't used.
+ */
+static struct rpc_xprt *
+xprt_setup_rdma_bc(struct xprt_create *args)
+{
+       struct rpc_xprt *xprt;
+       struct rpcrdma_xprt *new_xprt;
+
+       if (args->addrlen > sizeof(xprt->addr)) {
+               dprintk("RPC:       %s: address too large\n", __func__);
+               return ERR_PTR(-EBADF);
+       }
+
+       xprt = xprt_alloc(args->net, sizeof(*new_xprt),
+                         RPCRDMA_MAX_BC_REQUESTS,
+                         RPCRDMA_MAX_BC_REQUESTS);
+       if (!xprt) {
+               dprintk("RPC:       %s: couldn't allocate rpc_xprt\n",
+                       __func__);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       xprt->timeout = &xprt_rdma_bc_timeout;
+       xprt_set_bound(xprt);
+       xprt_set_connected(xprt);
+       xprt->bind_timeout = RPCRDMA_BIND_TO;
+       xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+       xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+       xprt->prot = XPRT_TRANSPORT_BC_RDMA;
+       xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32);
+       xprt->ops = &xprt_rdma_bc_procs;
+
+       memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+       xprt->addrlen = args->addrlen;
+       xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr);
+       xprt->resvport = 0;
+
+       xprt->max_payload = xprt_rdma_max_inline_read;
+
+       new_xprt = rpcx_to_rdmax(xprt);
+       new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs;
+
+       xprt_get(xprt);
+       args->bc_xprt->xpt_bc_xprt = xprt;
+       xprt->bc_xprt = args->bc_xprt;
+
+       if (!try_module_get(THIS_MODULE))
+               goto out_fail;
+
+       /* Final put for backchannel xprt is in __svc_rdma_free */
+       xprt_get(xprt);
+       return xprt;
+
+out_fail:
+       xprt_rdma_free_addresses(xprt);
+       args->bc_xprt->xpt_bc_xprt = NULL;
+       xprt_put(xprt);
+       xprt_free(xprt);
+       return ERR_PTR(-EINVAL);
+}
+
+struct xprt_class xprt_rdma_bc = {
+       .list                   = LIST_HEAD_INIT(xprt_rdma_bc.list),
+       .name                   = "rdma backchannel",
+       .owner                  = THIS_MODULE,
+       .ident                  = XPRT_TRANSPORT_BC_RDMA,
+       .setup                  = xprt_setup_rdma_bc,
+};
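One subtlety in xprt_setup_rdma_bc() above: on success the xprt carries two references (one for the args->bc_xprt->xpt_bc_xprt pointer, one reserved for the final put in __svc_rdma_free), and try_module_get() pins rpcrdma for as long as a backchannel transport exists. A sketch of the intended pairings (reconstructed from the code and comments above; the release sites are assumptions):

/*
 * Setup (above)                      Matching release
 * -------------                      ----------------
 * xprt_alloc()                       xprt_free()  in xprt_rdma_bc_put()
 * try_module_get(THIS_MODULE)        module_put() in xprt_rdma_bc_put()
 * xprt_get() for xpt_bc_xprt         xprt_put()   by the svc transport
 * xprt_get() "final put"             xprt_put()   in __svc_rdma_free()
 */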
index ff4f01e527ecc08a1480ecba8f00d41a90a76571..c8b8a8b4181eafa75de0d673040b9f927fa74d9d 100644 (file)
@@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 
                head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
                head->arg.page_len += len;
+
                head->arg.len += len;
                if (!pg_off)
                        head->count++;
@@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
                        goto err;
                atomic_inc(&xprt->sc_dma_used);
 
-               /* The lkey here is either a local dma lkey or a dma_mr lkey */
-               ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
+               ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
                ctxt->sge[pno].length = len;
                ctxt->count++;
 
@@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
        return ret;
 }
 
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+{
+       __be32 *p = (__be32 *)rmsgp;
+
+       if (!xprt->xpt_bc_xprt)
+               return false;
+
+       if (rmsgp->rm_type != rdma_msg)
+               return false;
+       if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+               return false;
+       if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+               return false;
+       if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+               return false;
+
+       /* sanity */
+       if (p[7] != rmsgp->rm_xid)
+               return false;
+       /* call direction */
+       if (p[8] == cpu_to_be32(RPC_CALL))
+               return false;
+
+       return true;
+}
+
 /*
  * Set up the rqstp thread context to point to the RQ buffer. If
  * necessary, pull additional data from the client with an RDMA_READ
@@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
                goto close_out;
        }
 
+       if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
+               ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+                                              &rqstp->rq_arg);
+               svc_rdma_put_context(ctxt, 0);
+               if (ret)
+                       goto repost;
+               return ret;
+       }
+
        /* Read read-list data. */
        ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
        if (ret > 0) {
@@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
        set_bit(XPT_CLOSE, &xprt->xpt_flags);
 defer:
        return 0;
+
+repost:
+       ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL);
+       if (ret) {
+               pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
+                      ret);
+               pr_err("svcrdma: closing transport %p.\n", rdma_xprt);
+               set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags);
+               ret = -ENOTCONN;
+       }
+       return ret;
 }
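The p[7]/p[8] tests in svc_rdma_is_backchannel_reply() above index 32-bit XDR words from the start of the RPC/RDMA header: with all three chunk lists empty the fixed header is seven words, so word 7 is the embedded RPC xid (which must echo rm_xid) and word 8 is the RPC message direction. A standalone model of that check, simplified (a sketch assuming the fixed RPCRDMA_HDRLEN_MIN layout; the rm_type and xpt_bc_xprt tests are omitted here):

#include <arpa/inet.h>  /* htonl() for on-the-wire byte order */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed word map: p[0..3] xid/vers/credit/type, p[4..6] empty
 * chunk lists, p[7] embedded RPC xid, p[8] RPC direction.
 */
static bool is_bc_reply(const uint32_t *p)
{
	if (p[4] || p[5] || p[6])   /* chunk lists must be empty */
		return false;
	if (p[7] != p[0])           /* sanity: RPC xid echoes rm_xid */
		return false;
	return p[8] == htonl(1);    /* direction RPC_REPLY, not RPC_CALL */
}

int main(void)
{
	uint32_t hdr[9] = {
		htonl(7), 0, 0, 0,      /* xid, vers, credit, type */
		0, 0, 0,                /* empty chunk lists */
		htonl(7), htonl(1)      /* RPC xid, direction = reply */
	};

	printf("backchannel reply: %d\n", is_bc_reply(hdr));
	return 0;
}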
index 969a1ab75fc3c5fb8011157e4f57e8d08f560b42..df57f3ce6cd2cc3cae5fa2a709e62ceae2828a26 100644 (file)
@@ -50,9 +50,9 @@
 
 #define RPCDBG_FACILITY        RPCDBG_SVCXPRT
 
-static int map_xdr(struct svcxprt_rdma *xprt,
-                  struct xdr_buf *xdr,
-                  struct svc_rdma_req_map *vec)
+int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
+                    struct xdr_buf *xdr,
+                    struct svc_rdma_req_map *vec)
 {
        int sge_no;
        u32 sge_bytes;
@@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt,
 
        if (xdr->len !=
            (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
-               pr_err("svcrdma: map_xdr: XDR buffer length error\n");
+               pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
                return -EIO;
        }
 
@@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt,
                sge_no++;
        }
 
-       dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+       dprintk("svcrdma: %s: sge_no %d page_no %d "
                "page_base %u page_len %u head_len %zu tail_len %zu\n",
-               sge_no, page_no, xdr->page_base, xdr->page_len,
+               __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
                xdr->head[0].iov_len, xdr->tail[0].iov_len);
 
        vec->count = sge_no;
@@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
                                         sge[sge_no].addr))
                        goto err;
                atomic_inc(&xprt->sc_dma_used);
-               sge[sge_no].lkey = xprt->sc_dma_lkey;
+               sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
                ctxt->count++;
                sge_off = 0;
                sge_no++;
@@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
        int ret;
 
        /* Post a recv buffer to handle another request. */
-       ret = svc_rdma_post_recv(rdma);
+       ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
        if (ret) {
                printk(KERN_INFO
                       "svcrdma: could not post a receive buffer, err=%d."
@@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
        ctxt->count = 1;
 
        /* Prepare the SGE for the RPCRDMA Header */
-       ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+       ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
        ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
        ctxt->sge[0].addr =
            ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
@@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
                                         ctxt->sge[sge_no].addr))
                        goto err;
                atomic_inc(&rdma->sc_dma_used);
-               ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+               ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
                ctxt->sge[sge_no].length = sge_bytes;
        }
        if (byte_count != 0) {
@@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
        /* Build an req vec for the XDR */
        ctxt = svc_rdma_get_context(rdma);
        ctxt->direction = DMA_TO_DEVICE;
-       vec = svc_rdma_get_req_map();
-       ret = map_xdr(rdma, &rqstp->rq_res, vec);
+       vec = svc_rdma_get_req_map(rdma);
+       ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec);
        if (ret)
                goto err0;
        inline_bytes = rqstp->rq_res.len;
 
        /* Create the RDMA response header */
-       res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+       ret = -ENOMEM;
+       res_page = alloc_page(GFP_KERNEL);
+       if (!res_page)
+               goto err0;
        rdma_resp = page_address(res_page);
        reply_ary = svc_rdma_get_reply_array(rdma_argp);
        if (reply_ary)
@@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 
        ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
                         inline_bytes);
-       svc_rdma_put_req_map(vec);
+       svc_rdma_put_req_map(rdma, vec);
        dprintk("svcrdma: send_reply returns %d\n", ret);
        return ret;
 
  err1:
        put_page(res_page);
  err0:
-       svc_rdma_put_req_map(vec);
+       svc_rdma_put_req_map(rdma, vec);
        svc_rdma_put_context(ctxt, 0);
        return ret;
 }
index b348b4adef29a48246709cc7f32cf576865753eb..5763825d09bf776bfa89f0007be1805203d96adf 100644 (file)
@@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt)
 }
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
+                                          gfp_t flags)
 {
        struct svc_rdma_op_ctxt *ctxt;
 
-       ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
-                               GFP_KERNEL | __GFP_NOFAIL);
-       ctxt->xprt = xprt;
-       INIT_LIST_HEAD(&ctxt->dto_q);
+       ctxt = kmalloc(sizeof(*ctxt), flags);
+       if (ctxt) {
+               ctxt->xprt = xprt;
+               INIT_LIST_HEAD(&ctxt->free);
+               INIT_LIST_HEAD(&ctxt->dto_q);
+       }
+       return ctxt;
+}
+
+static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
+{
+       unsigned int i;
+
+       /* Each RPC/RDMA credit can consume a number of send
+        * and receive WQEs. One ctxt is allocated for each.
+        */
+       i = xprt->sc_sq_depth + xprt->sc_rq_depth;
+
+       while (i--) {
+               struct svc_rdma_op_ctxt *ctxt;
+
+               ctxt = alloc_ctxt(xprt, GFP_KERNEL);
+               if (!ctxt) {
+                       dprintk("svcrdma: No memory for RDMA ctxt\n");
+                       return false;
+               }
+               list_add(&ctxt->free, &xprt->sc_ctxts);
+       }
+       return true;
+}
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+       struct svc_rdma_op_ctxt *ctxt = NULL;
+
+       spin_lock_bh(&xprt->sc_ctxt_lock);
+       xprt->sc_ctxt_used++;
+       if (list_empty(&xprt->sc_ctxts))
+               goto out_empty;
+
+       ctxt = list_first_entry(&xprt->sc_ctxts,
+                               struct svc_rdma_op_ctxt, free);
+       list_del_init(&ctxt->free);
+       spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+out:
        ctxt->count = 0;
        ctxt->frmr = NULL;
-       atomic_inc(&xprt->sc_ctxt_used);
        return ctxt;
+
+out_empty:
+       /* Either pre-allocation missed the mark, or send
+        * queue accounting is broken.
+        */
+       spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+       ctxt = alloc_ctxt(xprt, GFP_NOIO);
+       if (ctxt)
+               goto out;
+
+       spin_lock_bh(&xprt->sc_ctxt_lock);
+       xprt->sc_ctxt_used--;
+       spin_unlock_bh(&xprt->sc_ctxt_lock);
+       WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
+       return NULL;
 }
 
 void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
@@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
        for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
                /*
                 * Unmap the DMA addr in the SGE if the lkey matches
-                * the sc_dma_lkey, otherwise, ignore it since it is
+                * the local_dma_lkey, otherwise, ignore it since it is
                 * an FRMR lkey and will be unmapped later when the
                 * last WR that uses it completes.
                 */
-               if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+               if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
                        atomic_dec(&xprt->sc_dma_used);
                        ib_dma_unmap_page(xprt->sc_cm_id->device,
                                            ctxt->sge[i].addr,
@@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 
 void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 {
-       struct svcxprt_rdma *xprt;
+       struct svcxprt_rdma *xprt = ctxt->xprt;
        int i;
 
-       xprt = ctxt->xprt;
        if (free_pages)
                for (i = 0; i < ctxt->count; i++)
                        put_page(ctxt->pages[i]);
 
-       kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
-       atomic_dec(&xprt->sc_ctxt_used);
+       spin_lock_bh(&xprt->sc_ctxt_lock);
+       xprt->sc_ctxt_used--;
+       list_add(&ctxt->free, &xprt->sc_ctxts);
+       spin_unlock_bh(&xprt->sc_ctxt_lock);
 }
 
-/*
- * Temporary NFS req mappings are shared across all transport
- * instances. These are short lived and should be bounded by the number
- * of concurrent server threads * depth of the SQ.
- */
-struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
+{
+       while (!list_empty(&xprt->sc_ctxts)) {
+               struct svc_rdma_op_ctxt *ctxt;
+
+               ctxt = list_first_entry(&xprt->sc_ctxts,
+                                       struct svc_rdma_op_ctxt, free);
+               list_del(&ctxt->free);
+               kfree(ctxt);
+       }
+}
+
+static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
 {
        struct svc_rdma_req_map *map;
-       map = kmem_cache_alloc(svc_rdma_map_cachep,
-                              GFP_KERNEL | __GFP_NOFAIL);
+
+       map = kmalloc(sizeof(*map), flags);
+       if (map)
+               INIT_LIST_HEAD(&map->free);
+       return map;
+}
+
+static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
+{
+       unsigned int i;
+
+       /* One for each receive buffer on this connection. */
+       i = xprt->sc_max_requests;
+
+       while (i--) {
+               struct svc_rdma_req_map *map;
+
+               map = alloc_req_map(GFP_KERNEL);
+               if (!map) {
+                       dprintk("svcrdma: No memory for request map\n");
+                       return false;
+               }
+               list_add(&map->free, &xprt->sc_maps);
+       }
+       return true;
+}
+
+struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
+{
+       struct svc_rdma_req_map *map = NULL;
+
+       spin_lock(&xprt->sc_map_lock);
+       if (list_empty(&xprt->sc_maps))
+               goto out_empty;
+
+       map = list_first_entry(&xprt->sc_maps,
+                              struct svc_rdma_req_map, free);
+       list_del_init(&map->free);
+       spin_unlock(&xprt->sc_map_lock);
+
+out:
        map->count = 0;
        return map;
+
+out_empty:
+       spin_unlock(&xprt->sc_map_lock);
+
+       /* Pre-allocation amount was incorrect */
+       map = alloc_req_map(GFP_NOIO);
+       if (map)
+               goto out;
+
+       WARN_ONCE(1, "svcrdma: empty request map list?\n");
+       return NULL;
+}
+
+void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
+                         struct svc_rdma_req_map *map)
+{
+       spin_lock(&xprt->sc_map_lock);
+       list_add(&map->free, &xprt->sc_maps);
+       spin_unlock(&xprt->sc_map_lock);
 }
 
-void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
 {
-       kmem_cache_free(svc_rdma_map_cachep, map);
+       while (!list_empty(&xprt->sc_maps)) {
+               struct svc_rdma_req_map *map;
+
+               map = list_first_entry(&xprt->sc_maps,
+                                      struct svc_rdma_req_map, free);
+               list_del(&map->free);
+               kfree(map);
+       }
 }
 
 /* ib_cq event handler */
@@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
 static void process_context(struct svcxprt_rdma *xprt,
                            struct svc_rdma_op_ctxt *ctxt)
 {
+       struct svc_rdma_op_ctxt *read_hdr;
+       int free_pages = 0;
+
        svc_rdma_unmap_dma(ctxt);
 
        switch (ctxt->wr_op) {
        case IB_WR_SEND:
-               if (ctxt->frmr)
-                       pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
-               svc_rdma_put_context(ctxt, 1);
+               free_pages = 1;
                break;
 
        case IB_WR_RDMA_WRITE:
-               if (ctxt->frmr)
-                       pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
-               svc_rdma_put_context(ctxt, 0);
                break;
 
        case IB_WR_RDMA_READ:
        case IB_WR_RDMA_READ_WITH_INV:
                svc_rdma_put_frmr(xprt, ctxt->frmr);
-               if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
-                       struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
-                       if (read_hdr) {
-                               spin_lock_bh(&xprt->sc_rq_dto_lock);
-                               set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
-                               list_add_tail(&read_hdr->dto_q,
-                                             &xprt->sc_read_complete_q);
-                               spin_unlock_bh(&xprt->sc_rq_dto_lock);
-                       } else {
-                               pr_err("svcrdma: ctxt->read_hdr == NULL\n");
-                       }
-                       svc_xprt_enqueue(&xprt->sc_xprt);
-               }
+
+               if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags))
+                       break;
+
+               read_hdr = ctxt->read_hdr;
                svc_rdma_put_context(ctxt, 0);
-               break;
+
+               spin_lock_bh(&xprt->sc_rq_dto_lock);
+               set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+               list_add_tail(&read_hdr->dto_q,
+                             &xprt->sc_read_complete_q);
+               spin_unlock_bh(&xprt->sc_rq_dto_lock);
+               svc_xprt_enqueue(&xprt->sc_xprt);
+               return;
 
        default:
-               printk(KERN_ERR "svcrdma: unexpected completion type, "
-                      "opcode=%d\n",
-                      ctxt->wr_op);
+               dprintk("svcrdma: unexpected completion opcode=%d\n",
+                       ctxt->wr_op);
                break;
        }
+
+       svc_rdma_put_context(ctxt, free_pages);
 }
 
 /*
@@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
        INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
        INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
        INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
+       INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
+       INIT_LIST_HEAD(&cma_xprt->sc_maps);
        init_waitqueue_head(&cma_xprt->sc_send_wait);
 
        spin_lock_init(&cma_xprt->sc_lock);
        spin_lock_init(&cma_xprt->sc_rq_dto_lock);
        spin_lock_init(&cma_xprt->sc_frmr_q_lock);
-
-       cma_xprt->sc_ord = svcrdma_ord;
-
-       cma_xprt->sc_max_req_size = svcrdma_max_req_size;
-       cma_xprt->sc_max_requests = svcrdma_max_requests;
-       cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
-       atomic_set(&cma_xprt->sc_sq_count, 0);
-       atomic_set(&cma_xprt->sc_ctxt_used, 0);
+       spin_lock_init(&cma_xprt->sc_ctxt_lock);
+       spin_lock_init(&cma_xprt->sc_map_lock);
 
        if (listener)
                set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
@@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
        return cma_xprt;
 }
 
-int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
 {
        struct ib_recv_wr recv_wr, *bad_recv_wr;
        struct svc_rdma_op_ctxt *ctxt;
@@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
                        pr_err("svcrdma: Too many sges (%d)\n", sge_no);
                        goto err_put_ctxt;
                }
-               page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+               page = alloc_page(flags);
+               if (!page)
+                       goto err_put_ctxt;
                ctxt->pages[sge_no] = page;
                pa = ib_dma_map_page(xprt->sc_cm_id->device,
                                     page, 0, PAGE_SIZE,
@@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
                atomic_inc(&xprt->sc_dma_used);
                ctxt->sge[sge_no].addr = pa;
                ctxt->sge[sge_no].length = PAGE_SIZE;
-               ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+               ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
                ctxt->count = sge_no + 1;
                buflen += PAGE_SIZE;
        }
@@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
        struct rdma_conn_param conn_param;
        struct ib_cq_init_attr cq_attr = {};
        struct ib_qp_init_attr qp_attr;
-       struct ib_device_attr devattr;
-       int uninitialized_var(dma_mr_acc);
-       int need_dma_mr = 0;
-       int ret;
-       int i;
+       struct ib_device *dev;
+       unsigned int i;
+       int ret = 0;
 
        listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
        clear_bit(XPT_CONN, &xprt->xpt_flags);
@@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
        dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
                newxprt, newxprt->sc_cm_id);
 
-       ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
-       if (ret) {
-               dprintk("svcrdma: could not query device attributes on "
-                       "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
-               goto errout;
-       }
+       dev = newxprt->sc_cm_id->device;
 
        /* Qualify the transport resource defaults with the
         * capabilities of this particular device */
-       newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+       newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
                                  (size_t)RPCSVC_MAXPAGES);
-       newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+       newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd,
                                       RPCSVC_MAXPAGES);
-       newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
-                                  (size_t)svcrdma_max_requests);
-       newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+       newxprt->sc_max_req_size = svcrdma_max_req_size;
+       newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
+                                        svcrdma_max_requests);
+       newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
+                                           svcrdma_max_bc_requests);
+       newxprt->sc_rq_depth = newxprt->sc_max_requests +
+                              newxprt->sc_max_bc_requests;
+       newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+
+       if (!svc_rdma_prealloc_ctxts(newxprt))
+               goto errout;
+       if (!svc_rdma_prealloc_maps(newxprt))
+               goto errout;
 
        /*
         * Limit ORD based on client limit, local device limit, and
         * configured svcrdma limit.
         */
-       newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+       newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
        newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
 
-       newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+       newxprt->sc_pd = ib_alloc_pd(dev);
        if (IS_ERR(newxprt->sc_pd)) {
                dprintk("svcrdma: error creating PD for connect request\n");
                goto errout;
        }
        cq_attr.cqe = newxprt->sc_sq_depth;
-       newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+       newxprt->sc_sq_cq = ib_create_cq(dev,
                                         sq_comp_handler,
                                         cq_event_handler,
                                         newxprt,
@@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                dprintk("svcrdma: error creating SQ CQ for connect request\n");
                goto errout;
        }
-       cq_attr.cqe = newxprt->sc_max_requests;
-       newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+       cq_attr.cqe = newxprt->sc_rq_depth;
+       newxprt->sc_rq_cq = ib_create_cq(dev,
                                         rq_comp_handler,
                                         cq_event_handler,
                                         newxprt,
@@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
        qp_attr.event_handler = qp_event_handler;
        qp_attr.qp_context = &newxprt->sc_xprt;
        qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
-       qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+       qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
        qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
        qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
        qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                "    cap.max_send_sge = %d\n"
                "    cap.max_recv_sge = %d\n",
                newxprt->sc_cm_id, newxprt->sc_pd,
-               newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+               dev, newxprt->sc_pd->device,
                qp_attr.cap.max_send_wr,
                qp_attr.cap.max_recv_wr,
                qp_attr.cap.max_send_sge,
@@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
         *      of an RDMA_READ. IB does not.
         */
        newxprt->sc_reader = rdma_read_chunk_lcl;
-       if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+       if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
                newxprt->sc_frmr_pg_list_len =
-                       devattr.max_fast_reg_page_list_len;
+                       dev->attrs.max_fast_reg_page_list_len;
                newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
                newxprt->sc_reader = rdma_read_chunk_frmr;
        }
@@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
        /*
         * Determine if a DMA MR is required and if so, what privs are required
         */
-       if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device,
-                                newxprt->sc_cm_id->port_num) &&
-           !rdma_ib_or_roce(newxprt->sc_cm_id->device,
-                            newxprt->sc_cm_id->port_num))
+       if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
+           !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
                goto errout;
 
-       if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
-           !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
-               need_dma_mr = 1;
-               dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
-               if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
-                                       newxprt->sc_cm_id->port_num) &&
-                   !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
-                       dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
-       }
-
-       if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
-                               newxprt->sc_cm_id->port_num))
+       if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
                newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
 
-       /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
-       if (need_dma_mr) {
-               /* Register all of physical memory */
-               newxprt->sc_phys_mr =
-                       ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
-               if (IS_ERR(newxprt->sc_phys_mr)) {
-                       dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
-                               ret);
-                       goto errout;
-               }
-               newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
-       } else
-               newxprt->sc_dma_lkey =
-                       newxprt->sc_cm_id->device->local_dma_lkey;
-
        /* Post receive buffers */
-       for (i = 0; i < newxprt->sc_max_requests; i++) {
-               ret = svc_rdma_post_recv(newxprt);
+       for (i = 0; i < newxprt->sc_rq_depth; i++) {
+               ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
                if (ret) {
                        dprintk("svcrdma: failure posting receive buffers\n");
                        goto errout;
@@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work)
 {
        struct svcxprt_rdma *rdma =
                container_of(work, struct svcxprt_rdma, sc_work);
-       dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+       struct svc_xprt *xprt = &rdma->sc_xprt;
+
+       dprintk("svcrdma: %s(%p)\n", __func__, rdma);
 
        /* We should only be called from kref_put */
-       if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
+       if (atomic_read(&xprt->xpt_ref.refcount) != 0)
                pr_err("svcrdma: sc_xprt still in use? (%d)\n",
-                      atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
+                      atomic_read(&xprt->xpt_ref.refcount));
 
        /*
         * Destroy queued, but not processed read completions. Note
@@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work)
        }
 
        /* Warn if we leaked a resource or under-referenced */
-       if (atomic_read(&rdma->sc_ctxt_used) != 0)
+       if (rdma->sc_ctxt_used != 0)
                pr_err("svcrdma: ctxt still in use? (%d)\n",
-                      atomic_read(&rdma->sc_ctxt_used));
+                      rdma->sc_ctxt_used);
        if (atomic_read(&rdma->sc_dma_used) != 0)
                pr_err("svcrdma: dma still in use? (%d)\n",
                       atomic_read(&rdma->sc_dma_used));
 
-       /* De-allocate fastreg mr */
+       /* Final put of backchannel client transport */
+       if (xprt->xpt_bc_xprt) {
+               xprt_put(xprt->xpt_bc_xprt);
+               xprt->xpt_bc_xprt = NULL;
+       }
+
        rdma_dealloc_frmr_q(rdma);
+       svc_rdma_destroy_ctxts(rdma);
+       svc_rdma_destroy_maps(rdma);
 
        /* Destroy the QP if present (not a listener) */
        if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work)
        if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
                ib_destroy_cq(rdma->sc_rq_cq);
 
-       if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
-               ib_dereg_mr(rdma->sc_phys_mr);
-
        if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
                ib_dealloc_pd(rdma->sc_pd);
 
@@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
        int length;
        int ret;
 
-       p = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+       p = alloc_page(GFP_KERNEL);
+       if (!p)
+               return;
        va = page_address(p);
 
        /* XDR encode error */
@@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
                return;
        }
        atomic_inc(&xprt->sc_dma_used);
-       ctxt->sge[0].lkey = xprt->sc_dma_lkey;
+       ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
        ctxt->sge[0].length = length;
 
        /* Prepare SEND WR */
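
For reference, the hunks above retire the global svc_rdma_map_cachep/svc_rdma_ctxt_cachep kmem_caches in favor of per-transport free lists that are filled at accept time and refilled with GFP_NOIO only when the pre-allocation estimate was wrong. A minimal sketch of that pattern, under hypothetical demo_* names (not the svcrdma types themselves):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_ctxt {
	struct list_head free;		/* links idle ctxts on demo_xprt::ctxts */
};

struct demo_xprt {
	spinlock_t ctxt_lock;		/* protects ctxts and ctxt_used */
	struct list_head ctxts;		/* idle, pre-allocated ctxts */
	unsigned int ctxt_used;
};

/* Fill the free list up front so the hot path almost never allocates.
 * On failure the caller tears the list down again.
 */
static bool demo_prealloc(struct demo_xprt *xprt, unsigned int n)
{
	while (n--) {
		struct demo_ctxt *ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);

		if (!ctxt)
			return false;
		list_add(&ctxt->free, &xprt->ctxts);
	}
	return true;
}

/* Pop an idle ctxt; fall back to GFP_NOIO if accounting missed the mark. */
static struct demo_ctxt *demo_get(struct demo_xprt *xprt)
{
	struct demo_ctxt *ctxt;

	spin_lock_bh(&xprt->ctxt_lock);
	xprt->ctxt_used++;
	if (!list_empty(&xprt->ctxts)) {
		ctxt = list_first_entry(&xprt->ctxts, struct demo_ctxt, free);
		list_del_init(&ctxt->free);
		spin_unlock_bh(&xprt->ctxt_lock);
		return ctxt;
	}
	spin_unlock_bh(&xprt->ctxt_lock);

	ctxt = kmalloc(sizeof(*ctxt), GFP_NOIO);
	if (!ctxt) {
		spin_lock_bh(&xprt->ctxt_lock);
		xprt->ctxt_used--;
		spin_unlock_bh(&xprt->ctxt_lock);
	}
	return ctxt;
}

/* Return a ctxt to the idle list instead of freeing it. */
static void demo_put(struct demo_xprt *xprt, struct demo_ctxt *ctxt)
{
	spin_lock_bh(&xprt->ctxt_lock);
	xprt->ctxt_used--;
	list_add(&ctxt->free, &xprt->ctxts);
	spin_unlock_bh(&xprt->ctxt_lock);
}

Using GFP_NOIO on the fallback path matters because these allocations can happen while the server is completing NFS I/O; GFP_KERNEL there could recurse into reclaim that itself issues I/O.
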
index 740bddcf3488fe9c2e6db48f0b7967909a4caca0..b1b009f10ea375a3f63148973e58e486df1aa282 100644 (file)
@@ -63,7 +63,7 @@
  */
 
 static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
-static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
@@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = {
 
 #endif
 
-#define RPCRDMA_BIND_TO                (60U * HZ)
-#define RPCRDMA_INIT_REEST_TO  (5U * HZ)
-#define RPCRDMA_MAX_REEST_TO   (30U * HZ)
-#define RPCRDMA_IDLE_DISC_TO   (5U * 60 * HZ)
-
-static struct rpc_xprt_ops xprt_rdma_procs;    /* forward reference */
+static struct rpc_xprt_ops xprt_rdma_procs;    /* forward reference */
 
 static void
 xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
@@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
        xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
 }
 
-static void
+void
 xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
 {
        char buf[128];
@@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
        xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
 }
 
-static void
+void
 xprt_rdma_free_addresses(struct rpc_xprt *xprt)
 {
        unsigned int i;
@@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
        if (req == NULL)
                return NULL;
 
-       flags = GFP_NOIO | __GFP_NOWARN;
+       flags = RPCRDMA_DEF_GFP;
        if (RPC_IS_SWAPPER(task))
                flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
 
@@ -642,7 +637,7 @@ drop_connection:
        return -ENOTCONN;       /* implies disconnect */
 }
 
-static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 {
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        long idle_time = 0;
@@ -743,6 +738,11 @@ void xprt_rdma_cleanup(void)
 
        rpcrdma_destroy_wq();
        frwr_destroy_recovery_wq();
+
+       rc = xprt_unregister_transport(&xprt_rdma_bc);
+       if (rc)
+               dprintk("RPC:       %s: xprt_unregister(bc) returned %i\n",
+                       __func__, rc);
 }
 
 int xprt_rdma_init(void)
@@ -766,6 +766,14 @@ int xprt_rdma_init(void)
                return rc;
        }
 
+       rc = xprt_register_transport(&xprt_rdma_bc);
+       if (rc) {
+               xprt_unregister_transport(&xprt_rdma);
+               rpcrdma_destroy_wq();
+               frwr_destroy_recovery_wq();
+               return rc;
+       }
+
        dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
 
        dprintk("Defaults:\n");
index 732c71ce5dcab6758da0fe7661b877bfd8fa77ad..878f1bfb1db98f6972e641130d36a80cdf97faf8 100644 (file)
@@ -462,7 +462,6 @@ int
 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 {
        struct rpcrdma_ia *ia = &xprt->rx_ia;
-       struct ib_device_attr *devattr = &ia->ri_devattr;
        int rc;
 
        ia->ri_dma_mr = NULL;
@@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                goto out2;
        }
 
-       rc = ib_query_device(ia->ri_device, devattr);
-       if (rc) {
-               dprintk("RPC:       %s: ib_query_device failed %d\n",
-                       __func__, rc);
-               goto out3;
-       }
-
        if (memreg == RPCRDMA_FRMR) {
-               if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
-                   (devattr->max_fast_reg_page_list_len == 0)) {
+               if (!(ia->ri_device->attrs.device_cap_flags &
+                               IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+                   (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
                        dprintk("RPC:       %s: FRMR registration "
                                "not supported by HCA\n", __func__);
                        memreg = RPCRDMA_MTHCAFMR;
@@ -566,24 +559,23 @@ int
 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
                                struct rpcrdma_create_data_internal *cdata)
 {
-       struct ib_device_attr *devattr = &ia->ri_devattr;
        struct ib_cq *sendcq, *recvcq;
        struct ib_cq_init_attr cq_attr = {};
        unsigned int max_qp_wr;
        int rc, err;
 
-       if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+       if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
                dprintk("RPC:       %s: insufficient sge's available\n",
                        __func__);
                return -ENOMEM;
        }
 
-       if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
+       if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
                dprintk("RPC:       %s: insufficient wqe's available\n",
                        __func__);
                return -ENOMEM;
        }
-       max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS;
+       max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
 
        /* check provider's send/recv wr limits */
        if (cdata->max_requests > max_qp_wr)
@@ -668,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 
        /* Client offers RDMA Read but does not initiate */
        ep->rep_remote_cma.initiator_depth = 0;
-       if (devattr->max_qp_rd_atom > 32)       /* arbitrary but <= 255 */
+       if (ia->ri_device->attrs.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
                ep->rep_remote_cma.responder_resources = 32;
        else
                ep->rep_remote_cma.responder_resources =
-                                               devattr->max_qp_rd_atom;
+                                               ia->ri_device->attrs.max_qp_rd_atom;
 
        ep->rep_remote_cma.retry_count = 7;
        ep->rep_remote_cma.flow_control = 0;
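
These verbs.c hunks are the client-side half of the same 4.5-wide conversion seen in svc_rdma_accept() above: the ib_query_device() call is dropped, and the device attributes the IB core now caches at registration time are read straight from ib_device::attrs. The consumer side reduces to something like this (demo_check_device is a hypothetical caller):

#include <rdma/ib_verbs.h>

static int demo_check_device(struct ib_device *dev)
{
	/* dev->attrs is populated once by the IB core, so capability
	 * checks no longer need a separate, fallible query call */
	if (dev->attrs.max_sge < 2)
		return -ENODEV;

	if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		return -ENODEV;		/* HCA lacks FRMR support */

	return 0;
}
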
index 728101ddc44bfc4b3b6247c8d13f3222cb0f1386..38fe11b0987528c3d345676202db487cae269a87 100644 (file)
 #define RDMA_RESOLVE_TIMEOUT   (5000)  /* 5 seconds */
 #define RDMA_CONNECT_RETRY_MAX (2)     /* retries if no listener backlog */
 
+#define RPCRDMA_BIND_TO                (60U * HZ)
+#define RPCRDMA_INIT_REEST_TO  (5U * HZ)
+#define RPCRDMA_MAX_REEST_TO   (30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO   (5U * 60 * HZ)
+
 /*
  * Interface Adapter -- one per transport instance
  */
@@ -68,7 +73,6 @@ struct rpcrdma_ia {
        struct completion       ri_done;
        int                     ri_async_rc;
        unsigned int            ri_max_frmr_depth;
-       struct ib_device_attr   ri_devattr;
        struct ib_qp_attr       ri_qp_attr;
        struct ib_qp_init_attr  ri_qp_init_attr;
 };
@@ -142,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
        return (struct rpcrdma_msg *)rb->rg_base;
 }
 
+#define RPCRDMA_DEF_GFP                (GFP_NOIO | __GFP_NOWARN)
+
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
 * and complete a reply, asynchronously. It needs several pieces of
@@ -309,6 +315,8 @@ struct rpcrdma_buffer {
        u32                     rb_bc_srv_max_requests;
        spinlock_t              rb_reqslock;    /* protect rb_allreqs */
        struct list_head        rb_allreqs;
+
+       u32                     rb_bc_max_requests;
 };
 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
 
@@ -516,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
+extern unsigned int xprt_rdma_max_inline_read;
+void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
+void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
 int xprt_rdma_init(void);
 void xprt_rdma_cleanup(void);
 
@@ -531,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
 void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
-/* Temporary NFS request map cache. Created in svc_rdma.c  */
-extern struct kmem_cache *svc_rdma_map_cachep;
-/* WR context cache. Created in svc_rdma.c  */
-extern struct kmem_cache *svc_rdma_ctxt_cachep;
-/* Workqueue created in svc_rdma.c */
-extern struct workqueue_struct *svc_rdma_wq;
+extern struct xprt_class xprt_rdma_bc;
 
 #endif                         /* _LINUX_SUNRPC_XPRT_RDMA_H */
index 16622aef9bdea83bee67e5e0080c7b9a17d240ae..28414b0207ce58f1fdd8503da0f61a39ba5c7cfe 100644 (file)
@@ -99,7 +99,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode,
 
        dir = d_inode(parent);
 
-       mutex_lock(&dir->i_mutex);
+       inode_lock(dir);
        dentry = lookup_one_len(name, parent, strlen(name));
        if (IS_ERR(dentry))
                goto out;
@@ -129,14 +129,14 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode,
        }
        d_instantiate(dentry, inode);
        dget(dentry);
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        return dentry;
 
 out1:
        dput(dentry);
        dentry = ERR_PTR(error);
 out:
-       mutex_unlock(&dir->i_mutex);
+       inode_unlock(dir);
        simple_release_fs(&mount, &mount_count);
        return dentry;
 }
@@ -195,7 +195,7 @@ void securityfs_remove(struct dentry *dentry)
        if (!parent || d_really_is_negative(parent))
                return;
 
-       mutex_lock(&d_inode(parent)->i_mutex);
+       inode_lock(d_inode(parent));
        if (simple_positive(dentry)) {
                if (d_is_dir(dentry))
                        simple_rmdir(d_inode(parent), dentry);
@@ -203,7 +203,7 @@ void securityfs_remove(struct dentry *dentry)
                        simple_unlink(d_inode(parent), dentry);
                dput(dentry);
        }
-       mutex_unlock(&d_inode(parent)->i_mutex);
+       inode_unlock(d_inode(parent));
        simple_release_fs(&mount, &mount_count);
 }
 EXPORT_SYMBOL_GPL(securityfs_remove);
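
This conversion, like the IMA and SELinux hunks that follow, is purely mechanical: in this kernel the new inode_lock()/inode_unlock() helpers are thin wrappers over i_mutex, added so the lock's implementation can later change without touching every caller. Roughly, paraphrasing the include/linux/fs.h wrappers of this era:

/* paraphrase of the 4.5-era wrappers; the lock type behind them
 * is free to change (and later did) without editing callers */
static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}
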
index c21f09bf8b99210f3c3dcbff4497e1194848db2c..9d96551d0196cb3e7136ace6e747f9504c07fc7a 100644 (file)
@@ -121,7 +121,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint,
        if (!(mode & FMODE_WRITE))
                return;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
        if (atomic_read(&inode->i_writecount) == 1) {
                if ((iint->version != inode->i_version) ||
                    (iint->flags & IMA_NEW_FILE)) {
@@ -130,7 +130,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint,
                                ima_update_xattr(iint, file);
                }
        }
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
 }
 
 /**
@@ -186,7 +186,7 @@ static int process_measurement(struct file *file, int mask, int function,
        if (action & IMA_FILE_APPRAISE)
                function = FILE_CHECK;
 
-       mutex_lock(&inode->i_mutex);
+       inode_lock(inode);
 
        if (action) {
                iint = integrity_inode_get(inode);
@@ -250,7 +250,7 @@ out_free:
        if (pathbuf)
                __putname(pathbuf);
 out:
-       mutex_unlock(&inode->i_mutex);
+       inode_unlock(inode);
        if ((rc && must_appraise) && (ima_appraise & IMA_APPRAISE_ENFORCE))
                return -EACCES;
        return 0;
index 732c1c77dccd8be483d382e621569e4008b4db0c..1b1fd27de632692dc307e5a72cf65898f0e245b3 100644 (file)
@@ -380,9 +380,9 @@ static int sel_open_policy(struct inode *inode, struct file *filp)
                goto err;
 
        if (i_size_read(inode) != security_policydb_len()) {
-               mutex_lock(&inode->i_mutex);
+               inode_lock(inode);
                i_size_write(inode, security_policydb_len());
-               mutex_unlock(&inode->i_mutex);
+               inode_unlock(inode);
        }
 
        rc = security_read_policy(&plm->data, &plm->len);
index 196a6fe100ca8f40cc7f9b01eb00dc81c713ba93..a85d45595d02a265f1f8e1a4cd36c812d8cd08f0 100644 (file)
@@ -1405,6 +1405,8 @@ static int snd_ctl_tlv_ioctl(struct snd_ctl_file *file,
                return -EFAULT;
        if (tlv.length < sizeof(unsigned int) * 2)
                return -EINVAL;
+       if (!tlv.numid)
+               return -EINVAL;
        down_read(&card->controls_rwsem);
        kctl = snd_ctl_find_numid(card, tlv.numid);
        if (kctl == NULL) {
index f845ecf7e172935f938bc52ecfe9c95c7bac4e11..656d9a9032dc2165d1f30f4ef193250a07bec555 100644 (file)
@@ -90,7 +90,7 @@ static int snd_hrtimer_start(struct snd_timer *t)
        struct snd_hrtimer *stime = t->private_data;
 
        atomic_set(&stime->running, 0);
-       hrtimer_cancel(&stime->hrt);
+       hrtimer_try_to_cancel(&stime->hrt);
        hrtimer_start(&stime->hrt, ns_to_ktime(t->sticks * resolution),
                      HRTIMER_MODE_REL);
        atomic_set(&stime->running, 1);
@@ -101,6 +101,7 @@ static int snd_hrtimer_stop(struct snd_timer *t)
 {
        struct snd_hrtimer *stime = t->private_data;
        atomic_set(&stime->running, 0);
+       hrtimer_try_to_cancel(&stime->hrt);
        return 0;
 }
 
index b48b434444ed0e0239936887a891d15da0054e00..9630e9f72b7ba2ad77f50a516ecc559c0c200975 100644 (file)
@@ -255,10 +255,15 @@ static int snd_pcm_ioctl_hw_params_compat(struct snd_pcm_substream *substream,
        if (! (runtime = substream->runtime))
                return -ENOTTY;
 
-       /* only fifo_size is different, so just copy all */
-       data = memdup_user(data32, sizeof(*data32));
-       if (IS_ERR(data))
-               return PTR_ERR(data);
+       data = kmalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       /* only fifo_size (RO from userspace) is different, so just copy all */
+       if (copy_from_user(data, data32, sizeof(*data32))) {
+               err = -EFAULT;
+               goto error;
+       }
 
        if (refine)
                err = snd_pcm_hw_refine(substream, data);
index 81f7c109dc46e1b8ef9b4a2a67477fdf72940ef6..65175902a68a841650ac818f8fde713eca60c5b5 100644 (file)
@@ -49,11 +49,12 @@ static int snd_seq_call_port_info_ioctl(struct snd_seq_client *client, unsigned
        struct snd_seq_port_info *data;
        mm_segment_t fs;
 
-       data = memdup_user(data32, sizeof(*data32));
-       if (IS_ERR(data))
-               return PTR_ERR(data);
+       data = kmalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
 
-       if (get_user(data->flags, &data32->flags) ||
+       if (copy_from_user(data, data32, sizeof(*data32)) ||
+           get_user(data->flags, &data32->flags) ||
            get_user(data->time_queue, &data32->time_queue))
                goto error;
        data->kernel = NULL;
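
Both compat fixes above replace memdup_user(data32, sizeof(*data32)) with a native-sized kmalloc() plus a compat-sized copy_from_user(). The old call sized the allocation by the 32-bit layout, so native code that later touched the full struct could run past the end of the buffer. The corrected pattern, sketched with hypothetical types:

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_params32 {			/* 32-bit userspace layout */
	u32 flags;
};

struct demo_params {			/* native layout with trailing extras */
	u32 flags;
	u32 fifo_size;
};

static struct demo_params *demo_get_compat(struct demo_params32 __user *p32)
{
	struct demo_params *p = kmalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return ERR_PTR(-ENOMEM);
	/* allocate native size, copy compat size: sizeof(*p32) here,
	 * never sizeof(*p), so trailing native fields stay untouched */
	if (copy_from_user(p, p32, sizeof(*p32))) {
		kfree(p);
		return ERR_PTR(-EFAULT);
	}
	return p;
}
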
index cb25aded53499cdf86ceb954e2a4ffdca4913595..af1f68f7e315334202f191e3f049f044de9495f7 100644 (file)
@@ -65,6 +65,7 @@ struct snd_timer_user {
        int qtail;
        int qused;
        int queue_size;
+       bool disconnected;
        struct snd_timer_read *queue;
        struct snd_timer_tread *tqueue;
        spinlock_t qlock;
@@ -290,6 +291,9 @@ int snd_timer_open(struct snd_timer_instance **ti,
                mutex_unlock(&register_mutex);
                return -ENOMEM;
        }
+       /* take a card refcount for safe disconnection */
+       if (timer->card)
+               get_device(&timer->card->card_dev);
        timeri->slave_class = tid->dev_sclass;
        timeri->slave_id = slave_id;
        if (list_empty(&timer->open_list_head) && timer->hw.open)
@@ -359,6 +363,9 @@ int snd_timer_close(struct snd_timer_instance *timeri)
                }
                spin_unlock(&timer->lock);
                spin_unlock_irq(&slave_active_lock);
+               /* release a card refcount for safe disconnection */
+               if (timer->card)
+                       put_device(&timer->card->card_dev);
                mutex_unlock(&register_mutex);
        }
  out:
@@ -474,6 +481,8 @@ int snd_timer_start(struct snd_timer_instance *timeri, unsigned int ticks)
        timer = timeri->timer;
        if (timer == NULL)
                return -EINVAL;
+       if (timer->card && timer->card->shutdown)
+               return -ENODEV;
        spin_lock_irqsave(&timer->lock, flags);
        timeri->ticks = timeri->cticks = ticks;
        timeri->pticks = 0;
@@ -505,6 +514,10 @@ static int _snd_timer_stop(struct snd_timer_instance *timeri, int event)
        spin_lock_irqsave(&timer->lock, flags);
        list_del_init(&timeri->ack_list);
        list_del_init(&timeri->active_list);
+       if (timer->card && timer->card->shutdown) {
+               spin_unlock_irqrestore(&timer->lock, flags);
+               return 0;
+       }
        if ((timeri->flags & SNDRV_TIMER_IFLG_RUNNING) &&
            !(--timer->running)) {
                timer->hw.stop(timer);
@@ -565,6 +578,8 @@ int snd_timer_continue(struct snd_timer_instance *timeri)
        timer = timeri->timer;
        if (! timer)
                return -EINVAL;
+       if (timer->card && timer->card->shutdown)
+               return -ENODEV;
        spin_lock_irqsave(&timer->lock, flags);
        if (!timeri->cticks)
                timeri->cticks = 1;
@@ -628,6 +643,9 @@ static void snd_timer_tasklet(unsigned long arg)
        unsigned long resolution, ticks;
        unsigned long flags;
 
+       if (timer->card && timer->card->shutdown)
+               return;
+
        spin_lock_irqsave(&timer->lock, flags);
        /* now process all callbacks */
        while (!list_empty(&timer->sack_list_head)) {
@@ -668,6 +686,9 @@ void snd_timer_interrupt(struct snd_timer * timer, unsigned long ticks_left)
        if (timer == NULL)
                return;
 
+       if (timer->card && timer->card->shutdown)
+               return;
+
        spin_lock_irqsave(&timer->lock, flags);
 
        /* remember the current resolution */
@@ -881,8 +902,15 @@ static int snd_timer_dev_register(struct snd_device *dev)
 static int snd_timer_dev_disconnect(struct snd_device *device)
 {
        struct snd_timer *timer = device->device_data;
+       struct snd_timer_instance *ti;
+
        mutex_lock(&register_mutex);
        list_del_init(&timer->device_list);
+       /* wake up pending sleepers */
+       list_for_each_entry(ti, &timer->open_list_head, open_list) {
+               if (ti->disconnect)
+                       ti->disconnect(ti);
+       }
        mutex_unlock(&register_mutex);
        return 0;
 }
@@ -893,6 +921,8 @@ void snd_timer_notify(struct snd_timer *timer, int event, struct timespec *tstam
        unsigned long resolution = 0;
        struct snd_timer_instance *ti, *ts;
 
+       if (timer->card && timer->card->shutdown)
+               return;
        if (! (timer->hw.flags & SNDRV_TIMER_HW_SLAVE))
                return;
        if (snd_BUG_ON(event < SNDRV_TIMER_EVENT_MSTART ||
@@ -1051,6 +1081,8 @@ static void snd_timer_proc_read(struct snd_info_entry *entry,
 
        mutex_lock(&register_mutex);
        list_for_each_entry(timer, &snd_timer_list, device_list) {
+               if (timer->card && timer->card->shutdown)
+                       continue;
                switch (timer->tmr_class) {
                case SNDRV_TIMER_CLASS_GLOBAL:
                        snd_iprintf(buffer, "G%i: ", timer->tmr_device);
@@ -1185,6 +1217,14 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri,
        wake_up(&tu->qchange_sleep);
 }
 
+static void snd_timer_user_disconnect(struct snd_timer_instance *timeri)
+{
+       struct snd_timer_user *tu = timeri->callback_data;
+
+       tu->disconnected = true;
+       wake_up(&tu->qchange_sleep);
+}
+
 static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri,
                                      unsigned long resolution,
                                      unsigned long ticks)
@@ -1558,6 +1598,7 @@ static int snd_timer_user_tselect(struct file *file,
                        ? snd_timer_user_tinterrupt : snd_timer_user_interrupt;
                tu->timeri->ccallback = snd_timer_user_ccallback;
                tu->timeri->callback_data = (void *)tu;
+               tu->timeri->disconnect = snd_timer_user_disconnect;
        }
 
       __err:
@@ -1876,6 +1917,10 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer,
 
                        remove_wait_queue(&tu->qchange_sleep, &wait);
 
+                       if (tu->disconnected) {
+                               err = -ENODEV;
+                               break;
+                       }
                        if (signal_pending(current)) {
                                err = -ERESTARTSYS;
                                break;
@@ -1925,6 +1970,8 @@ static unsigned int snd_timer_user_poll(struct file *file, poll_table * wait)
        mask = 0;
        if (tu->qused)
                mask |= POLLIN | POLLRDNORM;
+       if (tu->disconnected)
+               mask |= POLLERR;
 
        return mask;
 }
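
The timer changes combine a card refcount (get_device()/put_device() on card_dev) with a per-user disconnected flag and a disconnect callback, so that readers blocked in snd_timer_user_read() and pollers learn about hot-unplug instead of sleeping on a dead device. The user-side pattern, reduced to hypothetical demo_* names:

#include <linux/errno.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct demo_user {
	bool disconnected;		/* set once, never cleared */
	wait_queue_head_t sleep;
	unsigned int queued;		/* events waiting to be read */
};

/* Device-removal callback: flag the user and kick any sleepers. */
static void demo_disconnect(struct demo_user *u)
{
	u->disconnected = true;
	wake_up(&u->sleep);
}

/* A reader re-checks the flag after every wakeup, before signals. */
static int demo_after_wakeup(struct demo_user *u)
{
	if (u->disconnected)
		return -ENODEV;
	if (signal_pending(current))
		return -ERESTARTSYS;
	return 0;
}

static unsigned int demo_poll_mask(struct demo_user *u)
{
	unsigned int mask = 0;

	if (u->queued)
		mask |= POLLIN | POLLRDNORM;
	if (u->disconnected)
		mask |= POLLERR;	/* surface the vanished device */
	return mask;
}
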
index c50177fb469f5f67974c853b109c07a68ef2524d..f6854dbd7d8d0eed41b5158c25b20e03211ccac5 100644 (file)
@@ -306,7 +306,7 @@ out_master_del:
 out_err:
        kfree(acomp);
        bus->audio_component = NULL;
-       dev_err(dev, "failed to add i915 component master (%d)\n", ret);
+       dev_info(dev, "failed to add i915 component master (%d)\n", ret);
 
        return ret;
 }
index 70671ad65d24565fb6c8522bd69444817aac2933..6efadbfb3fe3511b4726541cdd9ebd4aae6cd65d 100644 (file)
@@ -174,14 +174,40 @@ static inline bool codec_probed(struct hda_codec *codec)
        return device_attach(hda_codec_dev(codec)) > 0 && codec->preset;
 }
 
-/* try to auto-load and bind the codec module */
-static void codec_bind_module(struct hda_codec *codec)
+/* try to auto-load codec module */
+static void request_codec_module(struct hda_codec *codec)
 {
 #ifdef MODULE
        char modalias[32];
+       const char *mod = NULL;
+
+       switch (codec->probe_id) {
+       case HDA_CODEC_ID_GENERIC_HDMI:
+#if IS_MODULE(CONFIG_SND_HDA_CODEC_HDMI)
+               mod = "snd-hda-codec-hdmi";
+#endif
+               break;
+       case HDA_CODEC_ID_GENERIC:
+#if IS_MODULE(CONFIG_SND_HDA_GENERIC)
+               mod = "snd-hda-codec-generic";
+#endif
+               break;
+       default:
+               snd_hdac_codec_modalias(&codec->core, modalias, sizeof(modalias));
+               mod = modalias;
+               break;
+       }
+
+       if (mod)
+               request_module(mod);
+#endif /* MODULE */
+}
 
-       snd_hdac_codec_modalias(&codec->core, modalias, sizeof(modalias));
-       request_module(modalias);
+/* try to auto-load and bind the codec module */
+static void codec_bind_module(struct hda_codec *codec)
+{
+#ifdef MODULE
+       request_codec_module(codec);
        if (codec_probed(codec))
                return;
 #endif
@@ -218,17 +244,13 @@ static int codec_bind_generic(struct hda_codec *codec)
 
        if (is_likely_hdmi_codec(codec)) {
                codec->probe_id = HDA_CODEC_ID_GENERIC_HDMI;
-#if IS_MODULE(CONFIG_SND_HDA_CODEC_HDMI)
-               request_module("snd-hda-codec-hdmi");
-#endif
+               request_codec_module(codec);
                if (codec_probed(codec))
                        return 0;
        }
 
        codec->probe_id = HDA_CODEC_ID_GENERIC;
-#if IS_MODULE(CONFIG_SND_HDA_GENERIC)
-       request_module("snd-hda-codec-generic");
-#endif
+       request_codec_module(codec);
        if (codec_probed(codec))
                return 0;
        return -ENODEV;
index c0bef11afa7e2cc01138a46708f4d6e9cd4bd529..256e6cda218f939e6e6d09201c10d39827825ce9 100644 (file)
@@ -2078,9 +2078,11 @@ static int azx_probe_continue(struct azx *chip)
                         * for other chips, still continue probing as other
                         * codecs can be on the same link.
                         */
-                       if (CONTROLLER_IN_GPU(pci))
+                       if (CONTROLLER_IN_GPU(pci)) {
+                               dev_err(chip->card->dev,
+                                       "HSW/BDW HD-audio HDMI/DP requires binding with gfx driver\n");
                                goto out_free;
-                       else
+                       } else
                                goto skip_i915;
                }
 
@@ -2149,9 +2151,17 @@ i915_power_fail:
 static void azx_remove(struct pci_dev *pci)
 {
        struct snd_card *card = pci_get_drvdata(pci);
+       struct azx *chip;
+       struct hda_intel *hda;
+
+       if (card) {
+               /* flush the pending probing work */
+               chip = card->private_data;
+               hda = container_of(chip, struct hda_intel, chip);
+               flush_work(&hda->probe_work);
 
-       if (card)
                snd_card_free(card);
+       }
 }
 
 static void azx_shutdown(struct pci_dev *pci)
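
The azx_remove() change closes a teardown race: probing continues asynchronously on hda->probe_work, so removal must flush that work before snd_card_free(), or the deferred probe could run against a freed card. In miniature (hypothetical names):

#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_dev {
	struct work_struct probe_work;	/* deferred second-stage probe */
};

static void demo_remove(struct demo_dev *dev)
{
	/* once flush_work() returns, the work function is guaranteed
	 * not to be running, so freeing dev is safe */
	flush_work(&dev->probe_work);
	kfree(dev);
}
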
index 8143c0e24a27ea99bd212bfd33f2bdf805424885..33753244f48fe98de3a57709de2793abb74f2ae0 100644 (file)
@@ -6566,6 +6566,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800),
        SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_BASS_1A),
+       SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A),
        SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP),
        SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16),
        SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16),
index 39522367897caedb4be23f1f92579d31a4d1fe38..fac7e6eb9529c26d2bb3e30ceaba11b42f55d15b 100644 (file)
@@ -221,6 +221,8 @@ static int snd_at73c213_pcm_open(struct snd_pcm_substream *substream)
        runtime->hw = snd_at73c213_playback_hw;
        chip->substream = substream;
 
+       clk_enable(chip->ssc->clk);
+
        return 0;
 }
 
@@ -228,6 +230,7 @@ static int snd_at73c213_pcm_close(struct snd_pcm_substream *substream)
 {
        struct snd_at73c213 *chip = snd_pcm_substream_chip(substream);
        chip->substream = NULL;
+       clk_disable(chip->ssc->clk);
        return 0;
 }
 
@@ -897,6 +900,8 @@ static int snd_at73c213_dev_init(struct snd_card *card,
        chip->card = card;
        chip->irq = -1;
 
+       clk_enable(chip->ssc->clk);
+
        retval = request_irq(irq, snd_at73c213_interrupt, 0, "at73c213", chip);
        if (retval) {
                dev_dbg(&chip->spi->dev, "unable to request irq %d\n", irq);
@@ -935,6 +940,8 @@ out_irq:
        free_irq(chip->irq, chip);
        chip->irq = -1;
 out:
+       clk_disable(chip->ssc->clk);
+
        return retval;
 }
 
@@ -1012,7 +1019,9 @@ static int snd_at73c213_remove(struct spi_device *spi)
        int retval;
 
        /* Stop playback. */
+       clk_enable(chip->ssc->clk);
        ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXDIS));
+       clk_disable(chip->ssc->clk);
 
        /* Mute sound. */
        retval = snd_at73c213_write_reg(chip, DAC_LMPG, 0x3f);
@@ -1080,6 +1089,7 @@ static int snd_at73c213_suspend(struct device *dev)
        struct snd_at73c213 *chip = card->private_data;
 
        ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXDIS));
+       clk_disable(chip->ssc->clk);
        clk_disable(chip->board->dac_clk);
 
        return 0;
@@ -1091,6 +1101,7 @@ static int snd_at73c213_resume(struct device *dev)
        struct snd_at73c213 *chip = card->private_data;
 
        clk_enable(chip->board->dac_clk);
+       clk_enable(chip->ssc->clk);
        ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXEN));
 
        return 0;
index ea69ce35e902a828b40839cfd7beb14bb9ff29c8..c3bd294a63d1f7e4a75b2a950bbe398f33703cd6 100644 (file)
@@ -3746,7 +3746,7 @@ static const struct flag flags[] = {
        { "NET_TX_SOFTIRQ", 2 },
        { "NET_RX_SOFTIRQ", 3 },
        { "BLOCK_SOFTIRQ", 4 },
-       { "BLOCK_IOPOLL_SOFTIRQ", 5 },
+       { "IRQ_POLL_SOFTIRQ", 5 },
        { "TASKLET_SOFTIRQ", 6 },
        { "SCHED_SOFTIRQ", 7 },
        { "HRTIMER_SOFTIRQ", 8 },
index 8ff7d620d9427490f529cf7ec63471ae1bcb3b17..33b52eaa39db293b0de4d41c7312f88b39a2c4e2 100644 (file)
@@ -209,7 +209,7 @@ static const struct flag flags[] = {
        { "NET_TX_SOFTIRQ", 2 },
        { "NET_RX_SOFTIRQ", 3 },
        { "BLOCK_SOFTIRQ", 4 },
-       { "BLOCK_IOPOLL_SOFTIRQ", 5 },
+       { "IRQ_POLL_SOFTIRQ", 5 },
        { "TASKLET_SOFTIRQ", 6 },
        { "SCHED_SOFTIRQ", 7 },
        { "HRTIMER_SOFTIRQ", 8 },