changeset 12626:450b062c5f06

Add aarch64 support - Part 1: This patch adds some common assembly optimization functions for the aarch64 platform. These functions won't work until the Part 2 patch is merged.
author wangxiyuan <wangxiyuan@huawei.com>
date Thu, 19 Mar 2020 13:33:01 +0530
parents 30eb4de83092
children 36151fb4b72b
files source/common/aarch64/asm-primitives.cpp source/common/aarch64/asm.S source/common/aarch64/ipfilter8.S source/common/aarch64/ipfilter8.h source/common/aarch64/mc-a.S source/common/aarch64/pixel-util.S source/common/aarch64/pixel-util.h source/common/aarch64/pixel.h source/common/aarch64/sad-a.S
diffstat 9 files changed, 1489 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/asm-primitives.cpp	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,219 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "cpu.h"
+
+
+#if defined(__GNUC__)
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+#define GCC_4_9_0 40900
+#define GCC_5_1_0 50100
+
+extern "C" {
+#include "pixel.h"
+#include "pixel-util.h"
+#include "ipfilter8.h"
+}
+
+namespace X265_NS {
+// private x265 namespace
+
+
+// Combined horizontal + vertical 8-tap luma interpolation (pixel in, pixel
+// out) for one PU size.  Runs the horizontal pass into a 16-bit intermediate
+// buffer, then the vertical pass back down to pixels.  Dispatches through
+// primitives.pu[], so it mixes whatever C/assembly luma_hps and luma_vsp
+// implementations are currently installed (see setupAliasCPrimitives).
+template<int size>
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
+{
+    // Intermediate buffer: height extended by NTAPS_LUMA - 1 rows for the
+    // vertical filter taps (luma_hps is called with isRowExt = 1).
+    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
+    const int halfFilterSize = NTAPS_LUMA >> 1;
+    const int immedStride = MAX_CU_SIZE;
+
+    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);
+    // Skip the (halfFilterSize - 1) leading extension rows before filtering.
+    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
+}
+
+
+/* Temporary workaround because luma_vsp assembly primitive has not been completed
+ * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
+ * Otherwise, segment fault occurs. */
+// Copy the C luma_vsp primitives from 'cp' over the corresponding entries in
+// 'asmp' so interp_8tap_hv_pp_cpu never dispatches to a missing assembly
+// routine.  Only applied when X265_CPU_NEON is set in cpuMask.
+// The nested #if guards mirror the GCC-version structure used in
+// setupAssemblyPrimitives below, so the aliased set matches the set of sizes
+// for which luma_hvpp is installed there.
+// NOTE(review): AUTO_VECTORIZE is not defined in this file -- presumably
+// supplied by the build system; confirm.
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)
+{
+    if (cpuMask & X265_CPU_NEON)
+    {
+        asmp.pu[LUMA_8x4].luma_vsp   = cp.pu[LUMA_8x4].luma_vsp;
+        asmp.pu[LUMA_8x8].luma_vsp   = cp.pu[LUMA_8x8].luma_vsp;
+        asmp.pu[LUMA_8x16].luma_vsp  = cp.pu[LUMA_8x16].luma_vsp;
+        asmp.pu[LUMA_8x32].luma_vsp  = cp.pu[LUMA_8x32].luma_vsp;
+        asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        asmp.pu[LUMA_16x4].luma_vsp  = cp.pu[LUMA_16x4].luma_vsp;
+        asmp.pu[LUMA_16x8].luma_vsp  = cp.pu[LUMA_16x8].luma_vsp;
+        asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
+        asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
+        asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
+        asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
+        asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
+        asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
+        asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
+        asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
+        asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
+        asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
+        asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
+        asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
+        asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;    
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
+        asmp.pu[LUMA_4x4].luma_vsp   = cp.pu[LUMA_4x4].luma_vsp;
+        asmp.pu[LUMA_4x8].luma_vsp   = cp.pu[LUMA_4x8].luma_vsp;
+        asmp.pu[LUMA_4x16].luma_vsp  = cp.pu[LUMA_4x16].luma_vsp;
+        asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
+        asmp.pu[LUMA_32x8].luma_vsp  = cp.pu[LUMA_32x8].luma_vsp;
+#endif
+#endif
+    }
+}
+
+
+// Install the AArch64 NEON assembly primitives into 'p'.  Only the entries
+// assigned here are overridden; every other primitive keeps its existing
+// (C) implementation.  cpuMask is tested for X265_CPU_NEON only.
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) 
+{
+    if (cpuMask & X265_CPU_NEON)
+    {
+        // satd (luma)
+        p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
+        p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
+        p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
+        p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
+        p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
+        p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
+        
+        // satd (chroma 4:2:0) -- reuses the luma kernels of matching size
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);
+        
+        // satd (chroma 4:2:2)
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);
+
+        // pixel averaging -- same kernel serves aligned and unaligned entries
+        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        // multi-reference SAD (3 and 4 candidate blocks)
+        p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon);
+        p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);
+        p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);
+        p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);
+
+        p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);
+        p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);
+        p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);
+        p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);
+
+        // quant
+        p.quant = PFX(quant_neon);
+        // luma_hps
+        p.pu[LUMA_4x4].luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);
+        p.pu[LUMA_4x8].luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);
+        p.pu[LUMA_4x16].luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);
+        p.pu[LUMA_8x4].luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);
+        p.pu[LUMA_8x8].luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);
+        p.pu[LUMA_8x16].luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);
+        p.pu[LUMA_8x32].luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);
+        p.pu[LUMA_16x8].luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
+        p.pu[LUMA_32x8].luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
+#endif
+
+        // luma_hvpp: mixed pipeline (assembly luma_hps + C luma_vsp) via
+        // interp_8tap_hv_pp_cpu; see setupAliasCPrimitives for why.
+        p.pu[LUMA_8x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>;
+        p.pu[LUMA_8x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;
+        p.pu[LUMA_8x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;
+        p.pu[LUMA_8x32].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;
+        p.pu[LUMA_12x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;
+        p.pu[LUMA_16x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;
+        p.pu[LUMA_16x12].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;
+        p.pu[LUMA_16x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;
+        p.pu[LUMA_16x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;
+        p.pu[LUMA_16x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;
+        p.pu[LUMA_32x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;
+        p.pu[LUMA_32x24].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;
+        p.pu[LUMA_32x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;
+        p.pu[LUMA_32x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;
+        p.pu[LUMA_48x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;
+        p.pu[LUMA_64x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x16>;
+        p.pu[LUMA_64x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x32>;
+        p.pu[LUMA_64x48].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x48>;
+        p.pu[LUMA_64x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x64>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
+        p.pu[LUMA_4x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x4>;
+        p.pu[LUMA_4x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x8>;
+        p.pu[LUMA_4x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_4x16>;
+        p.pu[LUMA_24x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_24x32>;
+        p.pu[LUMA_32x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_32x8>;
+#endif
+#endif
+
+#if !HIGH_BIT_DEPTH
+        p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
+#endif // !HIGH_BIT_DEPTH
+
+    }
+}
+} // namespace X265_NS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/asm.S	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+.arch           armv8-a
+
+// Optional '_' symbol prefix for platforms whose C symbols carry a leading
+// underscore (the build system defines PREFIX on those targets).
+#ifdef PREFIX
+#define EXTERN_ASM _
+#else
+#define EXTERN_ASM
+#endif
+
+// On ELF targets ELF expands to nothing, so the .size/.type/.hidden
+// directives below are emitted; elsewhere it expands to a comment character
+// that suppresses the whole line.
+// NOTE(review): '@' is the ARM32 (AArch32) comment character; AArch64 gas
+// normally uses '//' -- confirm '@' still comments out the line on any
+// non-ELF aarch64 assembler this is expected to support.
+#ifdef __ELF__
+#define ELF
+#else
+#define ELF @
+#endif
+
+#define HAVE_AS_FUNC 1
+
+// FUNC gates the legacy .func/.endfunc debug directives.
+#if HAVE_AS_FUNC
+#define FUNC
+#else
+#define FUNC @
+#endif
+
+// function name [, export=1]
+// Emits the function label (EXTERN_ASM-prefixed and .global when exported)
+// together with ELF visibility/type metadata, and defines a one-shot
+// 'endfunc' macro that records the symbol size and then purges itself so
+// the next 'function' can redefine it.
+.macro function name, export=1
+    .macro endfunc
+ELF     .size   \name, . - \name
+FUNC    .endfunc
+        .purgem endfunc
+    .endm
+        .align  2
+.if \export == 1
+        .global EXTERN_ASM\name
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else
+ELF     .hidden \name
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+
+
+// Stride constants (named after x265's fenc/fdec temporary buffers).
+#define FENC_STRIDE 64
+#define FDEC_STRIDE 32
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/ipfilter8.S	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,414 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+
+
+// coeffIdx 0 (full-pel): result = 64 * centre tap (v5).
+// Widened 32-bit results land in v17 (low 4 lanes) / v18 (high 4 lanes).
+.macro qpel_filter_0_32b
+    movi            v24.8h, #64
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v24.4h
+    smull2          v18.4s, v19.8h, v24.8h
+.endm
+
+// coeffIdx 1: 8-tap filter { -1, 4, -10, 58, 17, -5, 1, 0 } over taps
+// t-3..t4 held in v0,v4,v1,v5,v2,v6,v3,v7 respectively (see vextin8).
+// 32-bit results accumulate into v17 (low) / v18 (high).
+.macro qpel_filter_1_32b
+    movi            v16.8h, #58
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h
+    smull2          v18.4s, v19.8h, v16.8h
+
+    // 10 * t-1
+    movi            v24.8h, #10
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h
+    smull2          v20.4s, v21.8h, v24.8h
+
+    // 17 * t1
+    movi            v16.8h, #17
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h
+    smull2          v22.4s, v23.8h, v16.8h
+
+    // 5 * t2
+    movi            v24.8h, #5
+    uxtl            v1.8h, v6.8b
+    smull           v23.4s, v1.4h, v24.4h
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s
+    sub             v18.4s, v18.4s, v20.4s
+
+    // 4 * t-2
+    uxtl            v1.8h, v4.8b
+    sshll           v19.4s, v1.4h, #2
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+
+    // t3 - t-3
+    uxtl            v1.8h, v0.8b
+    uxtl            v2.8h, v3.8b
+    ssubl           v21.4s, v2.4h, v1.4h
+    ssubl2          v22.4s, v2.8h, v1.8h
+
+    add             v17.4s, v17.4s, v19.4s
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+.endm
+
+// coeffIdx 2 (half-pel, symmetric): 8-tap filter
+// { -1, 4, -11, 40, 40, -11, 4, -1 } computed as
+// 40*(t0+t1) - 11*(t-1+t2) + 4*(t-2+t3) - (t-3+t4).
+// 32-bit results in v17 (low) / v18 (high).
+.macro qpel_filter_2_32b
+    movi            v16.4s, #11
+    uxtl            v19.8h, v5.8b
+    uxtl            v20.8h, v2.8b
+    saddl           v17.4s, v19.4h, v20.4h
+    saddl2          v18.4s, v19.8h, v20.8h
+
+    uxtl            v21.8h, v1.8b
+    uxtl            v22.8h, v6.8b
+    saddl           v19.4s, v21.4h, v22.4h
+    saddl2          v20.4s, v21.8h, v22.8h
+
+    mul             v19.4s, v19.4s, v16.4s
+    mul             v20.4s, v20.4s, v16.4s
+
+    movi            v16.4s, #40
+    mul             v17.4s, v17.4s, v16.4s
+    mul             v18.4s, v18.4s, v16.4s
+
+    uxtl            v21.8h, v4.8b
+    uxtl            v22.8h, v3.8b
+    saddl           v23.4s, v21.4h, v22.4h
+    saddl2          v16.4s, v21.8h, v22.8h
+
+    uxtl            v1.8h, v0.8b
+    uxtl            v2.8h, v7.8b
+    saddl           v21.4s, v1.4h, v2.4h
+    saddl2          v22.4s, v1.8h, v2.8h
+
+    shl             v23.4s, v23.4s, #2
+    shl             v16.4s, v16.4s, #2
+
+    add             v19.4s, v19.4s, v21.4s
+    add             v20.4s, v20.4s, v22.4s
+    add             v17.4s, v17.4s, v23.4s
+    add             v18.4s, v18.4s, v16.4s
+    sub             v17.4s, v17.4s, v19.4s
+    sub             v18.4s, v18.4s, v20.4s
+.endm
+
+// coeffIdx 3: mirror of coeffIdx 1 -- 8-tap filter
+// { 0, 1, -5, 17, 58, -10, 4, -1 } over taps t-3..t4 (see vextin8).
+// 32-bit results in v17 (low) / v18 (high).
+.macro qpel_filter_3_32b
+    movi            v16.8h, #17
+    movi            v24.8h, #5
+
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h
+    smull2          v18.4s, v19.8h, v16.8h
+
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h
+    smull2          v20.4s, v21.8h, v24.8h
+
+    movi            v16.8h, #58
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h
+    smull2          v22.4s, v23.8h, v16.8h
+
+    movi            v24.8h, #10
+    uxtl            v1.8h, v6.8b
+    smull           v23.4s, v1.4h, v24.4h
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s
+    sub             v18.4s, v18.4s, v20.4s
+
+    uxtl            v1.8h, v3.8b
+    sshll           v19.4s, v1.4h, #2
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+
+    uxtl            v1.8h, v4.8b
+    uxtl            v2.8h, v7.8b
+    ssubl           v21.4s, v1.4h, v2.4h
+    ssubl2          v22.4s, v1.8h, v2.8h
+
+    add             v17.4s, v17.4s, v19.4s
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s
+    add             v18.4s, v18.4s, v22.4s
+.endm
+
+
+
+
+// Load 16 source bytes from [x11] (post-increment 16) and splay eight
+// overlapping 8-byte tap windows into registers:
+//   v0 = bytes 1-8,  v4 = 2-9,  v1 = 3-10,  v5 = 4-11 (centre taps),
+//   v2 = 5-12, v6 = 6-13, v3 = 7-14, v7 = bytes 8-15.
+// Callers rewind x11 by 8 after each use, so each call consumes 8 pixels.
+.macro vextin8
+    ld1             {v3.16b}, [x11], #16
+    mov             v7.d[0], v3.d[1]
+    ext             v0.8b, v3.8b, v7.8b, #1
+    ext             v4.8b, v3.8b, v7.8b, #2
+    ext             v1.8b, v3.8b, v7.8b, #3
+    ext             v5.8b, v3.8b, v7.8b, #4
+    ext             v2.8b, v3.8b, v7.8b, #5
+    ext             v6.8b, v3.8b, v7.8b, #6
+    ext             v3.8b, v3.8b, v7.8b, #7
+.endm
+
+
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+// Shared horizontal-pass driver for an \a x \b block: applies \filterhps
+// 8 pixels at a time, subtracts the bias in w12 from the 32-bit sums, and
+// narrows to int16 output.  w10 = row count (set by LUMA_HPS), w5 = isRowExt.
+// NOTE(review): 8192 is presumably IF_INTERNAL_OFFS for 8-bit 'ps' output --
+// confirm against the C reference implementation.
+// Widths 4 and 12 are dispatched to HPS_FILTER_4 / HPS_FILTER_12.
+.macro HPS_FILTER a b filterhps
+    mov             w12, #8192
+    mov             w6, w10
+    // x3 becomes the byte gap from end of one output row to start of next
+    sub             x3, x3, #\a
+    lsl             x3, x3, #1
+    mov             w9, #\a
+    cmp             w9, #4
+    b.eq            14f
+    cmp             w9, #12
+    b.eq            15f
+    b               7f
+14:
+    HPS_FILTER_4 \a \b \filterhps
+    b               10f
+15:
+    HPS_FILTER_12 \a \b \filterhps
+    b               10f
+7:
+    cmp             w5, #0
+    b.eq            8f
+    cmp             w5, #1
+    b.eq            9f
+// NOTE(review): the rowext0 and rowext1 loop bodies below are identical --
+// candidates for merging into one loop.
+8:
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    // w7 = width / 8 inner iterations per row; x11 = src - 4 (filter taps)
+    mov             w7, #\a
+    lsr             w7, w7, #3
+    mov             x11, x0
+    sub             x11, x11, #4
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16
+    subs            w7, w7, #1
+    sub             x11, x11, #8
+    b.ne            loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    b.ne            loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
+    b               10f
+9:
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    mov             w7, #\a
+    lsr             w7, w7, #3
+    mov             x11, x0
+    sub             x11, x11, #4
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16
+    subs            w7, w7, #1
+    sub             x11, x11, #8
+    b.ne            loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    b.ne            loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
+10:
+.endm
+
+// Width-4 variant of HPS_FILTER: one filter evaluation per row, storing the
+// low 4 int16 results.  Same bias/narrowing scheme; w6 = row count.
+// NOTE(review): rowext0/rowext1 bodies are identical here as well.
+.macro HPS_FILTER_4 w h filterhps
+    cmp             w5, #0
+    b.eq            11f
+    cmp             w5, #1
+    b.eq            12f
+11:
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8
+    sub             x11, x11, #8
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    b.ne            loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               13f
+12:
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8
+    sub             x11, x11, #8
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    b.ne            loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
+13:
+.endm
+
+// Width-12 variant of HPS_FILTER: per row, one 8-wide evaluation (store 8
+// int16) followed by one more evaluation storing only the low 4 results.
+// NOTE(review): rowext0/rowext1 bodies are identical here as well.
+.macro HPS_FILTER_12 w h filterhps
+    cmp             w5, #0
+    b.eq            14f
+    cmp             w5, #1
+    b.eq            15f
+14:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16
+    sub             x11, x11, #8
+
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8
+    add             x2, x2, x3
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               16f
+15:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16
+    sub             x11, x11, #8
+
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8
+    add             x2, x2, x3
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
+16:
+.endm
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+// Entry point for one \w x \h block: adjusts src/row count when isRowExt
+// (w5) is set, then dispatches on coeffIdx (w4) to the matching qpel filter.
+.macro LUMA_HPS w h
+function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
+    mov             w10, #\h
+    cmp             w5, #0
+    b.eq            6f
+    // isRowExt: start 3 rows earlier (-4+1 rows) and emit 7 extra rows
+    // (NTAPS_LUMA - 1) for the following vertical pass.
+    sub             x0, x0, x1, lsl #2
+
+    add             x0, x0, x1
+    add             w10, w10, #7
+6:
+    cmp             w4, #0
+    b.eq            0f
+    cmp             w4, #1
+    b.eq            1f
+    cmp             w4, #2
+    b.eq            2f
+    cmp             w4, #3
+    b.eq            3f
+0:
+    HPS_FILTER  \w \h qpel_filter_0_32b
+    b               5f
+1:
+    HPS_FILTER  \w \h qpel_filter_1_32b
+    b               5f
+2:
+    HPS_FILTER  \w \h qpel_filter_2_32b
+    b               5f
+3:
+    HPS_FILTER  \w \h qpel_filter_3_32b
+    // NOTE(review): this branch is redundant -- control falls through to
+    // 5: either way.
+    b               5f
+5:
+    ret
+endfunc
+.endm
+
+// Instantiate interp_8tap_horiz_ps for every luma PU size.
+LUMA_HPS    4 4
+LUMA_HPS    4 8
+LUMA_HPS    4 16
+LUMA_HPS    8 4
+LUMA_HPS    8 8
+LUMA_HPS    8 16
+LUMA_HPS    8 32
+LUMA_HPS    12 16
+LUMA_HPS    16 4
+LUMA_HPS    16 8
+LUMA_HPS    16 12
+LUMA_HPS    16 16
+LUMA_HPS    16 32
+LUMA_HPS    16 64
+LUMA_HPS    24 32
+LUMA_HPS    32 8
+LUMA_HPS    32 16
+LUMA_HPS    32 24
+LUMA_HPS    32 32
+LUMA_HPS    32 64
+LUMA_HPS    48 64
+LUMA_HPS    64 16
+LUMA_HPS    64 32
+LUMA_HPS    64 48
+LUMA_HPS    64 64
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/ipfilter8.h	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,55 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_IPFILTER8_AARCH64_H
+#define X265_IPFILTER8_AARCH64_H
+
+
+/* AArch64 NEON horizontal 8-tap luma interpolation (pixel input, 16-bit
+ * "ps" output), one specialization per luma PU size; implemented in
+ * ipfilter8.S.  coeffIdx selects the quarter-pel filter (0-3); a non-zero
+ * isRowExt makes the routine start 3 rows earlier and produce 7 extra rows
+ * for a following vertical pass. */
+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+
+
+#endif // ifndef X265_IPFILTER8_AARCH64_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/mc-a.S	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,63 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.macro pixel_avg_pp_4xN_neon h          // emit pixel_avg_pp for a 4x\h block: dst = (src0 + src1 + 1) >> 1 per pixel
+function x265_pixel_avg_pp_4x\h\()_neon // x0=dst, x1=dstride, x2=src0, x3=sstride0, x4=src1, x5=sstride1; trailing int weight arg (w6) is unused — presumably only the unweighted case is wired up, confirm in Part 2
+.rept \h                                // fully unrolled, one 4-pixel row per iteration
+    ld1             {v0.s}[0], [x2], x3 // load 4 pixels from src0, advance by its stride
+    ld1             {v1.s}[0], [x4], x5 // load 4 pixels from src1
+    urhadd          v2.8b, v0.8b, v1.8b // unsigned rounding halving add: (a + b + 1) >> 1
+    st1             {v2.s}[0], [x0], x1 // store 4 averaged pixels to dst
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_4xN_neon 4
+pixel_avg_pp_4xN_neon 8
+pixel_avg_pp_4xN_neon 16
+
+.macro pixel_avg_pp_8xN_neon h          // emit pixel_avg_pp for an 8x\h block: dst = (src0 + src1 + 1) >> 1 per pixel
+function x265_pixel_avg_pp_8x\h\()_neon // x0=dst, x1=dstride, x2=src0, x3=sstride0, x4=src1, x5=sstride1; weight arg (w6) unused, as in the 4xN variant
+.rept \h                                // fully unrolled, one 8-pixel row per iteration
+    ld1             {v0.8b}, [x2], x3   // load 8 pixels from src0
+    ld1             {v1.8b}, [x4], x5   // load 8 pixels from src1
+    urhadd          v2.8b, v0.8b, v1.8b // unsigned rounding halving add: (a + b + 1) >> 1
+    st1             {v2.8b}, [x0], x1   // store 8 averaged pixels to dst
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_8xN_neon 4
+pixel_avg_pp_8xN_neon 8
+pixel_avg_pp_8xN_neon 16
+pixel_avg_pp_8xN_neon 32
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/pixel-util.S	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,419 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.macro x265_satd_4x8_8x4_end_neon       // finish a 4x8 (or 8x4) SATD: v4-v7 hold first-stage butterfly results; leaves the absolute-transform sum in s0
+    add             v0.8h, v4.8h, v6.8h // second vertical butterfly stage
+    add             v1.8h, v5.8h, v7.8h
+    sub             v2.8h, v4.8h, v6.8h
+    sub             v3.8h, v5.8h, v7.8h
+
+    trn1            v16.8h, v0.8h, v1.8h // interleave to pair adjacent columns for the horizontal transform
+    trn2            v17.8h, v0.8h, v1.8h
+    add             v4.8h, v16.8h, v17.8h
+    trn1            v18.8h, v2.8h, v3.8h
+    trn2            v19.8h, v2.8h, v3.8h
+    sub             v5.8h, v16.8h, v17.8h
+    add             v6.8h, v18.8h, v19.8h
+    sub             v7.8h, v18.8h, v19.8h
+    trn1            v0.4s, v4.4s, v6.4s  // regroup 32-bit pairs for the last butterfly stage
+    trn2            v2.4s, v4.4s, v6.4s
+    abs             v0.8h, v0.8h
+    trn1            v1.4s, v5.4s, v7.4s
+    trn2            v3.4s, v5.4s, v7.4s
+    abs             v2.8h, v2.8h
+    abs             v1.8h, v1.8h
+    abs             v3.8h, v3.8h
+    umax            v0.8h, v0.8h, v2.8h  // x264-style shortcut: |a+b| + |a-b| == 2*max(|a|,|b|), saving the final butterfly
+    umax            v1.8h, v1.8h, v3.8h
+    add             v0.8h, v0.8h, v1.8h
+    uaddlv          s0, v0.8h            // horizontal reduce to a single 32-bit sum in s0
+.endm
+
+.macro pixel_satd_4x8_neon              // SATD of one 4x8 block; x0/x1 = pix1/stride1, x2/x3 = pix2/stride2; result left in v0.s[0]
+    ld1r            {v1.2s}, [x2], x3   // rows 0..3 of pix2 replicated into both lanes (lane 1 overwritten below)
+    ld1r            {v0.2s}, [x0], x1   // rows 0..3 of pix1
+    ld1r            {v3.2s}, [x2], x3
+    ld1r            {v2.2s}, [x0], x1
+    ld1r            {v5.2s}, [x2], x3
+    ld1r            {v4.2s}, [x0], x1
+    ld1r            {v7.2s}, [x2], x3
+    ld1r            {v6.2s}, [x0], x1
+
+    ld1             {v1.s}[1], [x2], x3 // rows 4..7 go into lane 1, so each register pairs row i with row i+4
+    ld1             {v0.s}[1], [x0], x1
+    usubl           v0.8h, v0.8b, v1.8b // widening pix1 - pix2 differences, interleaved with the remaining loads
+    ld1             {v3.s}[1], [x2], x3
+    ld1             {v2.s}[1], [x0], x1
+    usubl           v1.8h, v2.8b, v3.8b
+    ld1             {v5.s}[1], [x2], x3
+    ld1             {v4.s}[1], [x0], x1
+    usubl           v2.8h, v4.8b, v5.8b
+    ld1             {v7.s}[1], [x2], x3
+    add             v4.8h, v0.8h, v1.8h // first vertical butterfly stage
+    sub             v5.8h, v0.8h, v1.8h
+    ld1             {v6.s}[1], [x0], x1
+    usubl           v3.8h, v6.8b, v7.8b
+    add             v6.8h, v2.8h, v3.8h
+    sub             v7.8h, v2.8h, v3.8h
+    x265_satd_4x8_8x4_end_neon          // finish transform and reduce into s0
+.endm
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x8_neon       // single 4x8 SATD; result comes back in v0.s[0]
+    pixel_satd_4x8_neon
+    mov             w0, v0.s[0]         // move scalar result to the integer return register
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x16_neon      // 4x16 = two stacked 4x8 SATDs; the macro advances x0/x2 past each sub-block
+    eor             w4, w4, w4          // running total = 0
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w0, w5, w4          // return accumulated SATD
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x32_neon      // 4x32 = four stacked 4x8 SATDs; the macro advances x0/x2 past each sub-block
+    eor             w4, w4, w4          // running total = 0
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+.endr
+    mov             w0, w4              // return accumulated SATD
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_12x16_neon     // 12x16 tiled as three 4-pixel-wide columns, each column two 4x8 SATDs
+    mov             x4, x0              // keep original row pointers: the 4x8 macro advances x0/x2
+    mov             x5, x2
+    eor             w7, w7, w7          // running total = 0
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    add             x0, x4, #4          // column 1: 4 bytes into the row
+    add             x2, x5, #4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+
+    add             x0, x4, #8          // column 2: 8 bytes into the row
+    add             x2, x5, #8
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w0, w7, w6          // return accumulated SATD
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_12x32_neon     // 12x32 tiled as three 4-pixel-wide columns, each column four 4x8 SATDs
+    mov             x4, x0              // keep original row pointers: the 4x8 macro advances x0/x2
+    mov             x5, x2
+    eor             w7, w7, w7          // running total = 0
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    add             x0, x4, #4          // column 1: 4 bytes into the row
+    add             x2, x5, #4
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    add             x0, x4, #8          // column 2: 8 bytes into the row
+    add             x2, x5, #8
+.rept 4
+    pixel_satd_4x8_neon
+    mov             w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    mov             w0, w7              // return accumulated SATD
+    ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_8x8_neon       // 8x8 as two side-by-side 4x8 SATD columns
+    eor             w4, w4, w4          // running total = 0
+    mov             x6, x0              // keep original row pointers: the 4x8 macro advances x0/x2
+    mov             x7, x2
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w4, w4, w5
+    add             x0, x6, #4          // right column: 4 bytes into the row
+    add             x2, x7, #4
+    pixel_satd_4x8_neon
+    mov             w5, v0.s[0]
+    add             w0, w4, w5          // return accumulated SATD
+    ret
+endfunc
+
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
+function x265_psyCost_4x4_neon          // psy cost = |energy(source) - energy(recon)|, energy = satd - (pixelSum >> 2); source and recon transforms run side by side
+    ld1r            {v4.2s}, [x0], x1   // source rows: v4 = {row0, row2}, v5 = {row1, row3} (lane 1 filled below)
+    ld1r            {v5.2s}, [x0], x1
+    ld1             {v4.s}[1], [x0], x1
+    ld1             {v5.s}[1], [x0], x1
+
+    ld1r            {v6.2s}, [x2], x3   // recon rows packed the same way into v6/v7
+    ld1r            {v7.2s}, [x2], x3
+    ld1             {v6.s}[1], [x2], x3
+    ld1             {v7.s}[1], [x2], x3
+
+    uaddl           v2.8h, v4.8b, v5.8b // first vertical butterfly stage (widen to 16-bit): source sums/diffs
+    usubl           v3.8h, v4.8b, v5.8b
+    uaddl           v18.8h, v6.8b, v7.8b // same for recon
+    usubl           v19.8h, v6.8b, v7.8b
+
+    mov             v20.d[0], v2.d[1]   // bring the upper half down to butterfly against the lower half
+    add             v0.4h, v2.4h, v20.4h
+    sub             v1.4h, v2.4h, v20.4h
+    mov             v21.d[0], v3.d[1]
+    add             v22.4h, v3.4h, v21.4h
+    sub             v23.4h, v3.4h, v21.4h
+
+    mov             v24.d[0], v18.d[1]  // recon: second vertical stage
+    add             v16.4h, v18.4h, v24.4h
+    sub             v17.4h, v18.4h, v24.4h
+    mov             v25.d[0], v19.d[1]
+    add             v26.4h, v19.4h, v25.4h
+    sub             v27.4h, v19.4h, v25.4h
+
+    mov             v0.d[1], v22.d[0]   // repack and transpose for the horizontal transform
+    mov             v1.d[1], v23.d[0]
+    trn1            v22.8h, v0.8h, v1.8h
+    trn2            v23.8h, v0.8h, v1.8h
+    mov             v16.d[1], v26.d[0]
+    mov             v17.d[1], v27.d[0]
+    trn1            v26.8h, v16.8h, v17.8h
+    trn2            v27.8h, v16.8h, v17.8h
+
+    add             v2.8h, v22.8h, v23.8h // horizontal butterfly stage
+    sub             v3.8h, v22.8h, v23.8h
+    add             v18.8h, v26.8h, v27.8h
+    sub             v19.8h, v26.8h, v27.8h
+
+    uaddl           v20.8h, v4.8b, v5.8b // plain pixel sums for the DC term: v20 = source, v21 = recon
+    uaddl           v21.8h, v6.8b, v7.8b
+
+    trn1            v0.4s, v2.4s, v3.4s
+    trn2            v1.4s, v2.4s, v3.4s
+    trn1            v16.4s, v18.4s, v19.4s
+    trn2            v17.4s, v18.4s, v19.4s
+    abs             v0.8h, v0.8h
+    abs             v16.8h, v16.8h
+    abs             v1.8h, v1.8h
+    abs             v17.8h, v17.8h
+
+    uaddlv          s20, v20.8h         // total source pixel sum
+    uaddlv          s21, v21.8h         // total recon pixel sum
+    mov             v20.s[1], v21.s[0]  // v20.2s = {srcSum, reconSum}
+
+    smax            v0.8h, v0.8h, v1.8h // |a+b| + |a-b| == 2*max(|a|,|b|) shortcut for the last butterfly
+    smax            v16.8h, v16.8h, v17.8h
+
+    trn1            v4.2d, v0.2d, v16.2d // interleave so source terms land in the low half, recon in the high
+    trn2            v5.2d, v0.2d, v16.2d
+    add             v0.8h, v4.8h, v5.8h
+    mov             v4.d[0], v0.d[1]
+    uaddlv          s0, v0.4h           // s0 = source satd term
+    uaddlv          s4, v4.4h           // s4 = recon satd term
+
+    ushr            v20.2s, v20.2s, #2  // pixelSum >> 2 (DC energy) for both blocks
+    mov             v0.s[1], v4.s[0]    // v0.2s = {srcSatd, reconSatd}
+    sub             v0.2s, v0.2s, v20.2s // energy = satd - (sum >> 2), per block
+    mov             w0, v0.s[0]
+    mov             w1, v0.s[1]
+    subs            w0, w0, w1
+    cneg            w0, w0, mi          // absolute difference of the two energies
+
+    ret
+endfunc
+
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+function x265_quant_neon                // x0=coef, x1=quantCoeff, x2=deltaU, x3=qCoef, w4=qBits, w5=add, w6=numCoeff; returns count of non-zero qCoef; assumes numCoeff is a multiple of 4 (processed 4 per iteration)
+    mov             w9, #1
+    lsl             w9, w9, w4
+    dup             v0.2s, w9           // v0.s[0] = 1 << qBits
+    neg             w9, w4
+    dup             v1.4s, w9           // v1 = -qBits: sshl by a negative count is an arithmetic right shift
+    add             w9, w9, #8
+    dup             v2.4s, w9           // v2 = -(qBits - 8), for the deltaU shift
+    dup             v3.4s, w5           // v3 = rounding add
+
+    lsr             w6, w6, #2          // loop count = numCoeff / 4
+    eor             v4.16b, v4.16b, v4.16b // v4 accumulates -1 per zero level
+    eor             w10, w10, w10       // w10 counts total coefficients processed
+    eor             v17.16b, v17.16b, v17.16b // zero vector for the cmeq below
+
+.loop_quant:
+
+    ld1             {v18.4h}, [x0], #8  // 4 input coefficients
+    ld1             {v7.4s}, [x1], #16  // 4 quantizer scale values
+    sxtl            v6.4s, v18.4h
+
+    cmlt            v5.4s, v6.4s, #0    // sign mask: all-ones where coef < 0
+
+    abs             v6.4s, v6.4s
+
+
+    mul             v6.4s, v6.4s, v7.4s // tmpLevel = |coef| * quantCoeff
+
+    add             v7.4s, v6.4s, v3.4s
+    sshl            v7.4s, v7.4s, v1.4s // level = (tmpLevel + add) >> qBits
+
+    mls             v6.4s, v7.4s, v0.s[0] // remainder = tmpLevel - (level << qBits)
+    sshl            v16.4s, v6.4s, v2.4s  // deltaU = remainder >> (qBits - 8)
+    st1             {v16.4s}, [x2], #16
+
+    // numsig
+    cmeq            v16.4s, v7.4s, v17.4s // -1 where level == 0
+    add             v4.4s, v4.4s, v16.4s
+    add             w10, w10, #4
+
+    // level *= sign
+    eor             v16.16b, v7.16b, v5.16b // (level ^ sign) - sign restores the original sign
+    sub             v16.4s, v16.4s, v5.4s
+    sqxtn           v5.4h, v16.4s       // saturating narrow to int16
+    st1             {v5.4h}, [x3], #8
+
+    subs            w6, w6, #1
+    b.ne             .loop_quant
+
+    addv            s4, v4.4s           // total = -(zero count)
+    mov             w9, v4.s[0]
+    add             w0, w10, w9         // numCoeff - zeros = number of significant coefficients
+    ret
+endfunc
+
+.macro satd_4x4_neon                    // SATD of one 4x4 block; x0/x1 = pix1/stride1, x2/x3 = pix2/stride2; result left in v0.d[0]
+    ld1             {v1.s}[0], [x2], x3 // rows 0-1 of pix2 into v1, rows 0-1 of pix1 into v0
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v3.s}[0], [x2], x3
+    ld1             {v2.s}[0], [x0], x1
+
+    ld1             {v1.s}[1], [x2], x3 // rows 2-3 into lane 1, pairing row i with row i+2
+    ld1             {v0.s}[1], [x0], x1
+    ld1             {v3.s}[1], [x2], x3
+    ld1             {v2.s}[1], [x0], x1
+
+    usubl           v4.8h, v0.8b, v1.8b // widening pix1 - pix2 differences
+    usubl           v5.8h, v2.8b, v3.8b
+
+    add             v6.8h, v4.8h, v5.8h // first vertical butterfly stage
+    sub             v7.8h, v4.8h, v5.8h
+
+    mov             v4.d[0], v6.d[1]    // fold the upper half down for the second stage
+    add             v0.8h, v6.8h, v4.8h
+    sub             v2.8h, v6.8h, v4.8h
+
+    mov             v5.d[0], v7.d[1]
+    add             v1.8h, v7.8h, v5.8h
+    sub             v3.8h, v7.8h, v5.8h
+
+    trn1            v4.4h, v0.4h, v1.4h // transpose for the horizontal transform
+    trn2            v5.4h, v0.4h, v1.4h
+
+    trn1            v6.4h, v2.4h, v3.4h
+    trn2            v7.4h, v2.4h, v3.4h
+
+    add             v0.4h, v4.4h, v5.4h // horizontal butterfly stage
+    sub             v1.4h, v4.4h, v5.4h
+
+    add             v2.4h, v6.4h, v7.4h
+    sub             v3.4h, v6.4h, v7.4h
+
+    trn1            v4.2s, v0.2s, v1.2s
+    trn2            v5.2s, v0.2s, v1.2s
+
+    trn1            v6.2s, v2.2s, v3.2s
+    trn2            v7.2s, v2.2s, v3.2s
+
+    abs             v4.4h, v4.4h
+    abs             v5.4h, v5.4h
+    abs             v6.4h, v6.4h
+    abs             v7.4h, v7.4h
+
+    smax            v1.4h, v4.4h, v5.4h // |a+b| + |a-b| == 2*max(|a|,|b|) shortcut for the last butterfly
+    smax            v2.4h, v6.4h, v7.4h
+
+    add             v0.4h, v1.4h, v2.4h
+    uaddlp          v0.2s, v0.4h        // pairwise widen-and-add down to a 64-bit scalar in v0.d[0]
+    uaddlp          v0.1d, v0.2s
+.endm
+
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x4_neon       // single 4x4 SATD; macro leaves the sum in v0.d[0]
+    satd_4x4_neon
+    umov            x0, v0.d[0]         // move scalar result to the integer return register
+    ret
+endfunc
+
+// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_8x4_neon       // 8x4 as two side-by-side 4x4 SATDs
+    mov             x4, x0              // keep original row pointers: the 4x4 macro advances x0/x2
+    mov             x5, x2
+    satd_4x4_neon
+    add             x0, x4, #4          // right column: 4 bytes into the row
+    add             x2, x5, #4
+    umov            x6, v0.d[0]         // stash the left-column result before the macro clobbers v0
+    satd_4x4_neon
+    umov            x0, v0.d[0]
+    add             x0, x0, x6          // return the summed SATD
+    ret
+endfunc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/pixel-util.h	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PIXEL_UTIL_AARCH64_H
+#define X265_PIXEL_UTIL_AARCH64_H
+
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+int x265_psyCost_4x4_neon(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); // explicit x265_ prefix (was PFX(...)) to match the literal symbol defined in pixel-util.S and the other declarations here
+
+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/pixel.h	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_I386_PIXEL_AARCH64_H
+#define X265_I386_PIXEL_AARCH64_H
+
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+
+#endif // ifndef X265_I386_PIXEL_AARCH64_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/sad-a.S	Thu Mar 19 13:33:01 2020 +0530
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.macro SAD_X_START_8 x  // first row of an 8-wide SAD_x\x: seed accumulators v16..v19; \x is 3 or 4 (number of candidate reference blocks)
+    ld1             {v0.8b}, [x0], x9  // load 8 fenc pixels, post-increment fenc ptr by FENC_STRIDE (x9)
+.if \x == 3
+    ld1             {v1.8b}, [x1], x4  // sad_x3 ABI: x1..x3 = fref0..fref2, x4 = frefstride
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x4
+.elseif \x == 4
+    ld1             {v1.8b}, [x1], x5  // sad_x4 ABI: x1..x4 = fref0..fref3, x5 = frefstride
+    ld1             {v2.8b}, [x2], x5
+    ld1             {v3.8b}, [x3], x5
+    ld1             {v4.8b}, [x4], x5
+.endif
+    uabdl           v16.8h, v0.8b, v1.8b  // v16 = |fenc - fref0| per pixel, widened to 8 x u16
+    uabdl           v17.8h, v0.8b, v2.8b  // v17 = |fenc - fref1|
+    uabdl           v18.8h, v0.8b, v3.8b  // v18 = |fenc - fref2|
+.if \x == 4
+    uabdl           v19.8h, v0.8b, v4.8b  // v19 = |fenc - fref3| (x4 variant only)
+.endif
+.endm
+
+.macro SAD_X_8 x  // subsequent row of an 8-wide SAD_x\x: accumulate into v16..v19 (must follow SAD_X_START_8, which seeds them)
+    ld1             {v0.8b}, [x0], x9  // load next 8 fenc pixels, advance fenc ptr by FENC_STRIDE (x9)
+.if \x == 3
+    ld1             {v1.8b}, [x1], x4  // sad_x3: x4 = frefstride
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x4
+.elseif \x == 4
+    ld1             {v1.8b}, [x1], x5  // sad_x4: x5 = frefstride
+    ld1             {v2.8b}, [x2], x5
+    ld1             {v3.8b}, [x3], x5
+    ld1             {v4.8b}, [x4], x5
+.endif
+    uabal           v16.8h, v0.8b, v1.8b  // v16 += |fenc - fref0| (widening accumulate into u16 lanes)
+    uabal           v17.8h, v0.8b, v2.8b  // v17 += |fenc - fref1|
+    uabal           v18.8h, v0.8b, v3.8b  // v18 += |fenc - fref2|
+.if \x == 4
+    uabal           v19.8h, v0.8b, v4.8b  // v19 += |fenc - fref3| (x4 variant only)
+.endif
+.endm
+
+.macro SAD_X_8xN x, h  // emit x265_sad_x\x\()_8x\h\()_neon: SAD of one 8x\h fenc block against \x reference blocks, results to res[0..\x-1]
+function x265_sad_x\x\()_8x\h\()_neon
+    mov             x9, #FENC_STRIDE  // fenc rows are FENC_STRIDE bytes apart (fixed-stride encode buffer; FENC_STRIDE defined in asm.S)
+    SAD_X_START_8 \x  // row 0: initialize per-reference accumulators v16..v19
+.rept \h - 1
+    SAD_X_8 \x  // rows 1..\h-1: accumulate; u16 lanes cannot overflow (max 32 rows * 255 = 8160)
+.endr
+    uaddlv          s0, v16.8h  // horizontal add of 8 u16 lanes -> 32-bit SAD vs fref0
+    uaddlv          s1, v17.8h  // SAD vs fref1
+    uaddlv          s2, v18.8h  // SAD vs fref2
+.if \x == 4
+    uaddlv          s3, v19.8h  // SAD vs fref3 (x4 variant only)
+.endif
+
+.if \x == 3
+    stp             s0, s1, [x5]  // sad_x3: x5 = res (int32_t*); store res[0], res[1]
+    str             s2, [x5, #8]  // res[2]
+.elseif \x == 4
+    stp             s0, s1, [x6]  // sad_x4: x6 = res; store res[0], res[1]
+    stp             s2, s3, [x6, #8]  // res[2], res[3]
+.endif
+    ret
+endfunc
+.endm
+
+SAD_X_8xN 3 4  // x265_sad_x3_8x4_neon .. x3_8x32: 3-candidate SAD, 8-wide blocks
+SAD_X_8xN 3 8
+SAD_X_8xN 3 16
+SAD_X_8xN 3 32
+
+SAD_X_8xN 4 4  // x265_sad_x4_8x4_neon .. x4_8x32: 4-candidate SAD, 8-wide blocks
+SAD_X_8xN 4 8
+SAD_X_8xN 4 16
+SAD_X_8xN 4 32