diff options
author | Andreas Sturmlechner <asturm@gentoo.org> | 2019-05-21 21:28:14 +0200 |
---|---|---|
committer | Andreas Sturmlechner <asturm@gentoo.org> | 2019-05-21 21:29:45 +0200 |
commit | dcc0c10f45442c42c2bdbf10564a8eb39196edee (patch) | |
tree | 9ffb1de5bc2a0ca040a42739da9ce938af062e3b /media-libs | |
parent | sys-kernel/gentoo-sources: amd64 stable wrt bug #685984 (diff) | |
download | gentoo-dcc0c10f45442c42c2bdbf10564a8eb39196edee.tar.gz gentoo-dcc0c10f45442c42c2bdbf10564a8eb39196edee.tar.bz2 gentoo-dcc0c10f45442c42c2bdbf10564a8eb39196edee.zip |
media-libs/zimg: EAPI-7 bump
Package-Manager: Portage-2.3.66, Repoman-2.3.12
Signed-off-by: Andreas Sturmlechner <asturm@gentoo.org>
Diffstat (limited to 'media-libs')
-rw-r--r-- | media-libs/zimg/files/zimg-2.7.5-sse2.patch | 231 | ||||
-rw-r--r-- | media-libs/zimg/zimg-2.7.5.ebuild | 35 |
2 files changed, 247 insertions, 19 deletions
diff --git a/media-libs/zimg/files/zimg-2.7.5-sse2.patch b/media-libs/zimg/files/zimg-2.7.5-sse2.patch new file mode 100644 index 000000000000..77483ab59e73 --- /dev/null +++ b/media-libs/zimg/files/zimg-2.7.5-sse2.patch @@ -0,0 +1,231 @@ +From e30112df0ca703be82ed2c852511916fc46defbd Mon Sep 17 00:00:00 2001 +From: sekrit-twc <noreply@example.com> +Date: Fri, 22 Mar 2019 18:51:14 -0700 +Subject: [PATCH] colorspace: use bfloat16 for SSE2 linear-to-gamma LUT + +On Skylake, processing 512 pixel array: + +direction cycles/sample cycles/pxvector + g->l 2.35 9.43 + l->g 2.49 9.97 +--- + .../colorspace/x86/operation_impl_avx2.cpp | 3 +- + .../colorspace/x86/operation_impl_sse2.cpp | 104 ++++++++++++++++-- + test/colorspace/x86/colorspace_sse2_test.cpp | 14 +-- + 3 files changed, 100 insertions(+), 21 deletions(-) + +diff --git a/src/zimg/colorspace/x86/operation_impl_avx2.cpp b/src/zimg/colorspace/x86/operation_impl_avx2.cpp +index bbbbf896..f0e7f792 100644 +--- a/src/zimg/colorspace/x86/operation_impl_avx2.cpp ++++ b/src/zimg/colorspace/x86/operation_impl_avx2.cpp +@@ -114,8 +114,7 @@ class ToGammaLutOperationAVX2 final : public Operation { + { + EnsureSinglePrecision x87; + +- // Allocate an extra LUT entry so that indexing can be done by multipying by a power of 2. +- for (unsigned long i = 0; i <= UINT16_MAX; ++i) { ++ for (size_t i = 0; i <= UINT16_MAX; ++i) { + uint16_t half = static_cast<uint16_t>(i); + float x = _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(half))); + m_lut[i] = func(x * prescale); +diff --git a/src/zimg/colorspace/x86/operation_impl_sse2.cpp b/src/zimg/colorspace/x86/operation_impl_sse2.cpp +index 48645031..da9d4dbc 100644 +--- a/src/zimg/colorspace/x86/operation_impl_sse2.cpp ++++ b/src/zimg/colorspace/x86/operation_impl_sse2.cpp +@@ -3,11 +3,13 @@ + #include <algorithm> + #include <cstddef> + #include <cstdint> ++#include <type_traits> + #include <vector> + #include <emmintrin.h> + #include "common/align.h" + #include "common/ccdep.h" + #include "common/make_unique.h" ++#include "common/x86/sse2_util.h" + #include "colorspace/gamma.h" + #include "colorspace/operation.h" + #include "colorspace/operation_impl.h" +@@ -20,14 +22,25 @@ namespace { + + constexpr unsigned LUT_DEPTH = 16; + +-void lut_filter_line(const float *RESTRICT lut, unsigned lut_depth, float prescale, const float *src, float *dst, unsigned left, unsigned right) ++template <class T, class U> ++T bit_cast(const U &x) noexcept ++{ ++ static_assert(sizeof(T) == sizeof(U), "object sizes must match"); ++ static_assert(std::is_pod<T>::value && std::is_pod<U>::value, "object types must be POD"); ++ ++ T ret; ++ std::copy_n(reinterpret_cast<const char *>(&x), sizeof(x), reinterpret_cast<char *>(&ret)); ++ return ret; ++} ++ ++void to_linear_lut_filter_line(const float *RESTRICT lut, unsigned lut_depth, const float *src, float *dst, unsigned left, unsigned right) + { + unsigned vec_left = ceil_n(left, 4); + unsigned vec_right = floor_n(right, 4); + + const int32_t lut_limit = static_cast<int32_t>(1) << lut_depth; + +- const __m128 scale = _mm_set_ps1(0.5f * prescale * lut_limit); ++ const __m128 scale = _mm_set_ps1(0.5f * lut_limit); + const __m128 offset = _mm_set_ps1(0.25f * lut_limit); + const __m128i limit = _mm_set1_epi16(std::min(lut_limit + INT16_MIN, static_cast<int32_t>(INT16_MAX))); + const __m128i bias_epi16 = _mm_set1_epi16(INT16_MIN); +@@ -73,16 +86,61 @@ void lut_filter_line(const float *RESTRICT lut, unsigned lut_depth, float presca + } + } + ++void to_gamma_lut_filter_line(const float *RESTRICT lut, const float *src, float *dst, unsigned left, unsigned right) ++{ ++ unsigned vec_left = ceil_n(left, 4); ++ unsigned vec_right = floor_n(right, 4); ++ ++ for (unsigned j = left; j < vec_left; ++j) { ++ __m128i x = _mm_castps_si128(_mm_load_ss(src + j)); ++ __m128i msb = _mm_srli_epi32(x, 16); ++ __m128i lsb = _mm_and_si128(_mm_srli_epi32(x, 15), _mm_set1_epi32(1)); ++ x = mm_packus_epi32(msb, lsb); ++ x = _mm_adds_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2))); ++ ++ dst[j] = lut[_mm_cvtsi128_si32(x)]; ++ } ++ for (unsigned j = vec_left; j < vec_right; j += 4) { ++ __m128i x = _mm_castps_si128(_mm_load_ps(src + j)); ++ __m128i msb = _mm_srli_epi32(x, 16); ++ __m128i lsb = _mm_and_si128(_mm_srli_epi32(x, 15), _mm_set1_epi32(1)); ++ x = mm_packus_epi32(msb, lsb); ++ x = _mm_adds_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2))); ++ ++#if SIZE_MAX >= UINT64_MAX ++ uint64_t tmp = _mm_cvtsi128_si64(x); ++ dst[j + 0] = lut[tmp & 0xFFFFU]; ++ dst[j + 1] = lut[(tmp >> 16) & 0xFFFFU]; ++ dst[j + 2] = lut[(tmp >> 32) & 0xFFFFU]; ++ dst[j + 3] = lut[tmp >> 48]; ++#else ++ uint32_t tmp0 = _mm_cvtsi128_si32(x); ++ uint32_t tmp1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(x, _MM_SHUFFLE(3, 2, 0, 1))); ++ dst[j + 0] = lut[tmp0 & 0xFFFFU]; ++ dst[j + 1] = lut[tmp0 >> 16]; ++ dst[j + 2] = lut[tmp1 & 0xFFFFU]; ++ dst[j + 3] = lut[tmp1 >> 16]; ++#endif ++ } ++ for (unsigned j = vec_right; j < right; ++j) { ++ __m128i x = _mm_castps_si128(_mm_load_ss(src + j)); ++ __m128i msb = _mm_srli_epi32(x, 16); ++ __m128i lsb = _mm_and_si128(_mm_srli_epi32(x, 15), _mm_set1_epi32(1)); ++ x = mm_packus_epi32(msb, lsb); ++ x = _mm_adds_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2))); ++ ++ dst[j] = lut[_mm_cvtsi128_si32(x)]; ++ } ++} ++ + +-class LutOperationSSE2 final : public Operation { ++class ToLinearLutOperationSSE2 final : public Operation { + std::vector<float> m_lut; + unsigned m_lut_depth; +- float m_prescale; + public: +- LutOperationSSE2(gamma_func func, unsigned lut_depth, float prescale, float postscale) : ++ ToLinearLutOperationSSE2(gamma_func func, unsigned lut_depth, float postscale) : + m_lut((1UL << lut_depth) + 1), +- m_lut_depth{ lut_depth }, +- m_prescale{ static_cast<float>(prescale) } ++ m_lut_depth{ lut_depth } + { + EnsureSinglePrecision x87; + +@@ -95,9 +153,31 @@ class LutOperationSSE2 final : public Operation { + + void process(const float * const *src, float * const *dst, unsigned left, unsigned right) const override + { +- lut_filter_line(m_lut.data(), m_lut_depth, m_prescale, src[0], dst[0], left, right); +- lut_filter_line(m_lut.data(), m_lut_depth, m_prescale, src[1], dst[1], left, right); +- lut_filter_line(m_lut.data(), m_lut_depth, m_prescale, src[2], dst[2], left, right); ++ to_linear_lut_filter_line(m_lut.data(), m_lut_depth, src[0], dst[0], left, right); ++ to_linear_lut_filter_line(m_lut.data(), m_lut_depth, src[1], dst[1], left, right); ++ to_linear_lut_filter_line(m_lut.data(), m_lut_depth, src[2], dst[2], left, right); ++ } ++}; ++ ++class ToGammaLutOperationSSE2 final : public Operation { ++ std::vector<float> m_lut; ++public: ++ ToGammaLutOperationSSE2(gamma_func func, float prescale) : ++ m_lut(static_cast<uint32_t>(UINT16_MAX) + 1) ++ { ++ EnsureSinglePrecision x87; ++ ++ for (size_t i = 0; i <= UINT16_MAX; ++i) { ++ float x = bit_cast<float>(static_cast<uint32_t>(i << 16)); ++ m_lut[i] = func(x * prescale); ++ } ++ } ++ ++ void process(const float * const *src, float * const *dst, unsigned left, unsigned right) const override ++ { ++ to_gamma_lut_filter_line(m_lut.data(), src[0], dst[0], left, right); ++ to_gamma_lut_filter_line(m_lut.data(), src[1], dst[1], left, right); ++ to_gamma_lut_filter_line(m_lut.data(), src[2], dst[2], left, right); + } + }; + +@@ -109,7 +189,7 @@ std::unique_ptr<Operation> create_gamma_operation_sse2(const TransferFunction &t + if (!params.approximate_gamma) + return nullptr; + +- return ztd::make_unique<LutOperationSSE2>(transfer.to_gamma, LUT_DEPTH, transfer.to_gamma_scale, 1.0f); ++ return ztd::make_unique<ToGammaLutOperationSSE2>(transfer.to_gamma, transfer.to_gamma_scale); + } + + std::unique_ptr<Operation> create_inverse_gamma_operation_sse2(const TransferFunction &transfer, const OperationParams ¶ms) +@@ -117,7 +197,7 @@ std::unique_ptr<Operation> create_inverse_gamma_operation_sse2(const TransferFun + if (!params.approximate_gamma) + return nullptr; + +- return ztd::make_unique<LutOperationSSE2>(transfer.to_linear, LUT_DEPTH, 1.0f, transfer.to_linear_scale); ++ return ztd::make_unique<ToLinearLutOperationSSE2>(transfer.to_linear, LUT_DEPTH, transfer.to_linear_scale); + } + + } // namespace colorspace +diff --git a/test/colorspace/x86/colorspace_sse2_test.cpp b/test/colorspace/x86/colorspace_sse2_test.cpp +index d5130868..ecaa05e7 100644 +--- a/test/colorspace/x86/colorspace_sse2_test.cpp ++++ b/test/colorspace/x86/colorspace_sse2_test.cpp +@@ -53,9 +53,9 @@ TEST(ColorspaceConversionSSE2Test, test_transfer_lut) + "162687e701627cdc17283a32c36ea711d28a953e" + }, + { +- "492587e7ed75b7e3ab868bead6ade7a4137c6ea1", +- "3b0694e9fbce61466cb5a575f300d784089b6cad", +- "b68f103f52ccafae867d664d7f27fe56ae9208af" ++ "95f2715bd0d417028bebd5c5377180fcd5b01119", ++ "76f7c88b198f1ab08167f8162c1237b54f22007a", ++ "1099c3ae187c0a9f79acb9445761b6056218c779" + }, + { + "4c0b5ffe768a7812d1ef102b4d8d52614838bc8e", +@@ -63,13 +63,13 @@ TEST(ColorspaceConversionSSE2Test, test_transfer_lut) + "85a277a80dfca2e21789cedd76aaee307dbc4562" + }, + { +- "df546ce0ad6f859499a96d2d697d896067e60e38", +- "f0041b8a008ab45f0ea1319090ac7e8be0990d92", +- "06880efb598e41f96fa79e04dbdfcccd50d6dc6f" ++ "5e35786d313e936566d9873ba7a08a8d6005b2ee", ++ "829fa88acfbbb26801871bf3cadf5cc2eb6830c9", ++ "f82fcad18a19b548d419a1952b6a7a423a684b62" + }, + }; + const double expected_tolinear_snr = 80.0; +- const double expected_togamma_snr = 40.0; ++ const double expected_togamma_snr = 60.0; + + SCOPED_TRACE("tolinear 709"); + test_case({ MatrixCoefficients::RGB, TransferCharacteristics::REC_709, ColorPrimaries::UNSPECIFIED }, diff --git a/media-libs/zimg/zimg-2.7.5.ebuild b/media-libs/zimg/zimg-2.7.5.ebuild index 1775f6b60bbb..7bac59befc36 100644 --- a/media-libs/zimg/zimg-2.7.5.ebuild +++ b/media-libs/zimg/zimg-2.7.5.ebuild @@ -1,36 +1,33 @@ # Copyright 1999-2019 Gentoo Authors # Distributed under the terms of the GNU General Public License v2 -EAPI=5 +EAPI=7 -AUTOTOOLS_AUTORECONF=yes - -SCM="" - -if [ "${PV#9999}" != "${PV}" ] ; then - SCM="git-r3" +if [[ ${PV} = *9999* ]] ; then EGIT_REPO_URI="https://github.com/sekrit-twc/zimg" -fi - -inherit autotools-multilib ${SCM} - -DESCRIPTION="Scaling, colorspace conversion, and dithering library" -HOMEPAGE="https://github.com/sekrit-twc/zimg" - -if [ "${PV#9999}" = "${PV}" ] ; then + inherit git-r3 +else SRC_URI="https://github.com/sekrit-twc/zimg/archive/release-${PV}.tar.gz -> ${P}.tar.gz" KEYWORDS="~alpha amd64 ~arm arm64 ~hppa ~ia64 ~ppc ~ppc64 ~sparc ~x86" S="${WORKDIR}/${PN}-release-${PV}/" fi +inherit autotools multilib-minimal + +DESCRIPTION="Scaling, colorspace conversion, and dithering library" +HOMEPAGE="https://github.com/sekrit-twc/zimg" LICENSE="WTFPL-2" SLOT="0" IUSE="static-libs cpu_flags_x86_sse" -DEPEND="" -RDEPEND="${DEPEND}" +PATCHES=( "${FILESDIR}/${P}-sse2.patch" ) + +src_prepare() { + default + eautoreconf +} -src_configure() { - autotools-multilib_src_configure \ +multilib_src_configure() { + ECONF_SOURCE="${S}" econf \ $(use_enable cpu_flags_x86_sse x86simd) } |