WindowServer: Optimize rendering

We now use SSE2 to do alpha blending on 4 pixels at a time where
possible, and use memcpy instead of manual loops for non-blended regions.
This commit is contained in:
2026-04-06 17:35:36 +03:00
parent 1487c86262
commit 2a4a688c2d
2 changed files with 144 additions and 51 deletions

View File

@@ -14,15 +14,15 @@
#include <unistd.h> #include <unistd.h>
#include <emmintrin.h>
WindowServer::WindowServer(Framebuffer& framebuffer, int32_t corner_radius) WindowServer::WindowServer(Framebuffer& framebuffer, int32_t corner_radius)
: m_framebuffer(framebuffer) : m_framebuffer(framebuffer)
, m_corner_radius(corner_radius) , m_corner_radius(corner_radius)
, m_cursor({ framebuffer.width / 2, framebuffer.height / 2 }) , m_cursor({ framebuffer.width / 2, framebuffer.height / 2 })
, m_font(MUST(LibFont::Font::load("/usr/share/fonts/lat0-16.psfu"_sv))) , m_font(MUST(LibFont::Font::load("/usr/share/fonts/lat0-16.psfu"_sv)))
{ {
BAN::Vector<LibImage::Image::Color> bitmap; MUST(m_background_image.resize(m_framebuffer.width * m_framebuffer.height, 0xFF101010));
MUST(bitmap.resize(m_framebuffer.width * m_framebuffer.height, { 0x10, 0x10, 0x10, 0xFF }));
m_background_image = MUST(BAN::UniqPtr<LibImage::Image>::create(m_framebuffer.width, m_framebuffer.height, BAN::move(bitmap)));
MUST(m_pending_syncs.resize(m_framebuffer.height)); MUST(m_pending_syncs.resize(m_framebuffer.height));
@@ -33,7 +33,11 @@ BAN::ErrorOr<void> WindowServer::set_background_image(BAN::UniqPtr<LibImage::Ima
{ {
if (image->width() != (uint64_t)m_framebuffer.width || image->height() != (uint64_t)m_framebuffer.height) if (image->width() != (uint64_t)m_framebuffer.width || image->height() != (uint64_t)m_framebuffer.height)
image = TRY(image->resize(m_framebuffer.width, m_framebuffer.height)); image = TRY(image->resize(m_framebuffer.width, m_framebuffer.height));
m_background_image = BAN::move(image);
for (int32_t y = 0; y < m_framebuffer.height; y++)
for (int32_t x = 0; x < m_framebuffer.width; x++)
m_background_image[y * m_framebuffer.width + x] = image->get_color(x, y).as_argb();
invalidate(m_framebuffer.area()); invalidate(m_framebuffer.area());
return {}; return {};
} }
@@ -855,8 +859,7 @@ void WindowServer::set_focused_window(BAN::RefPtr<Window> window)
static uint32_t alpha_blend(uint32_t color_a, uint32_t color_b) static uint32_t alpha_blend(uint32_t color_a, uint32_t color_b)
{ {
const uint32_t a_a = color_a >> 24; const uint32_t a_a = color_a >> 24;
const uint32_t a_b = ((color_b >> 24) * (256 - a_a)) >> 8; const uint32_t a_b = 0xFF - a_a;
const uint32_t a = a_a + a_b;
const uint32_t rb1 = (a_a * (color_a & 0xFF00FF)) >> 8; const uint32_t rb1 = (a_a * (color_a & 0xFF00FF)) >> 8;
const uint32_t rb2 = (a_b * (color_b & 0xFF00FF)) >> 8; const uint32_t rb2 = (a_b * (color_b & 0xFF00FF)) >> 8;
@@ -864,14 +867,51 @@ static uint32_t alpha_blend(uint32_t color_a, uint32_t color_b)
const uint32_t g1 = (a_a * (color_a & 0x00FF00)) >> 8; const uint32_t g1 = (a_a * (color_a & 0x00FF00)) >> 8;
const uint32_t g2 = (a_b * (color_b & 0x00FF00)) >> 8; const uint32_t g2 = (a_b * (color_b & 0x00FF00)) >> 8;
return (a << 24) | ((rb1 | rb2) & 0xFF00FF) | ((g1 | g2) & 0x00FF00); const uint32_t a = a_a + (((color_b >> 24) * a_b) >> 8);
return (a << 24) | ((rb1 + rb2) & 0xFF00FF) | ((g1 + g2) & 0x00FF00);
}
// Blends four pixels of color_a over the corresponding four pixels of color_b
// with SSE2 and writes the four results to color_out.
//
// Per pixel, with a = top byte (alpha) of the color_a pixel:
//   each of the three low bytes = (c_a * a + c_b * (255 - a)) >> 8
//   top byte                    = a + ((alpha_b * (255 - a)) >> 8)
//
// All three pointers are accessed with unaligned loads/stores, so they do not
// have to be 16 byte aligned. Both inputs are fully loaded before the single
// store at the end, so color_out may point to the same memory as an input.
static void alpha_blend4(const uint32_t color_a[4], const uint32_t color_b[4], uint32_t color_out[4])
{
// load 4 pixels (16 bytes) from each input
const __m128i ca = _mm_loadu_si128((const __m128i*)color_a);
const __m128i cb = _mm_loadu_si128((const __m128i*)color_b);
// zero-extend every byte to a 16 bit word so the multiplications below have
// room for their results; *_lo holds pixels 0-1, *_hi holds pixels 2-3
const __m128i zero = _mm_setzero_si128();
const __m128i ca_lo = _mm_unpacklo_epi8(ca, zero);
const __m128i ca_hi = _mm_unpackhi_epi8(ca, zero);
const __m128i cb_lo = _mm_unpacklo_epi8(cb, zero);
const __m128i cb_hi = _mm_unpackhi_epi8(cb, zero);
// broadcast the alpha of each color_a pixel (word 3 of its four words) to all
// four words of that pixel; shufflelo handles the lower pixel of the register,
// shufflehi the upper one
const __m128i a1_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ca_lo, 0b11'11'11'11), 0b11'11'11'11);
const __m128i a1_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ca_hi, 0b11'11'11'11), 0b11'11'11'11);
// calculate inverse alpha (255 - alpha) for every word
const __m128i low_byte16 = _mm_set1_epi16(0xFF);
const __m128i a2_lo = _mm_sub_epi16(low_byte16, a1_lo);
const __m128i a2_hi = _mm_sub_epi16(low_byte16, a1_hi);
// blend and pack rgb (a*c1 + c2*(255-a)) / 256
// the two weights sum to 255, so the 16 bit sum is at most 255*255 = 65025 and
// cannot overflow. the alpha word is run through the same formula here, but
// its result is discarded by the 0x00FFFFFF mask after packing back to bytes
const __m128i rgb_lo = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(ca_lo, a1_lo), _mm_mullo_epi16(cb_lo, a2_lo)), 8);
const __m128i rgb_hi = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(ca_hi, a1_hi), _mm_mullo_epi16(cb_hi, a2_hi)), 8);
const __m128i rgb = _mm_and_si128(_mm_set1_epi32(0x00FFFFFF), _mm_packus_epi16(rgb_lo, rgb_hi));
// broadcast the alpha of each color_b pixel the same way as for color_a
const __m128i ab_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(cb_lo, 0b11'11'11'11), 0b11'11'11'11);
const __m128i ab_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(cb_hi, 0b11'11'11'11), 0b11'11'11'11);
// blend and pack alpha (a + ab*(255-a)) / 256
// every word of a pixel holds the same alpha value here, so after packing all
// four bytes of the pixel are equal; shifting each 32 bit lane left by 24
// moves one copy into the top byte and clears the three low bytes
const __m128i alpha_lo = _mm_add_epi16(a1_lo, _mm_srli_epi16(_mm_mullo_epi16(ab_lo, a2_lo), 8));
const __m128i alpha_hi = _mm_add_epi16(a1_hi, _mm_srli_epi16(_mm_mullo_epi16(ab_hi, a2_hi), 8));
const __m128i alpha = _mm_slli_epi32(_mm_packus_epi16(alpha_lo, alpha_hi), 24);
// combine the alpha byte with the three color bytes and write all 4 pixels
_mm_storeu_si128((__m128i*)color_out, _mm_or_si128(alpha, rgb));
} }
void WindowServer::invalidate(Rectangle area) void WindowServer::invalidate(Rectangle area)
{ {
ASSERT(m_background_image->width() == (uint64_t)m_framebuffer.width);
ASSERT(m_background_image->height() == (uint64_t)m_framebuffer.height);
const Window::Cursor* window_cursor = nullptr; const Window::Cursor* window_cursor = nullptr;
if (auto window = this->find_hovered_window(); window && window->has_cursor()) if (auto window = this->find_hovered_window(); window && window->has_cursor())
window_cursor = &window->cursor(); window_cursor = &window->cursor();
@@ -919,24 +959,46 @@ void WindowServer::invalidate(Rectangle area)
if (client_area == m_framebuffer.area()) if (client_area == m_framebuffer.area())
{ {
const uint32_t* client_ptr = m_focused_window->framebuffer();
const size_t client_width = m_focused_window->client_width();
if (!should_alpha_blend) if (!should_alpha_blend)
{ {
for (int32_t y = area.y; y < area.y + area.height; y++) for (int32_t y = area.y; y < area.y + area.height; y++)
for (int32_t x = area.x; x < area.x + area.width; x++) {
m_framebuffer.mmap[y * m_framebuffer.width + x] = m_focused_window->framebuffer()[y * m_focused_window->client_width() + x]; memcpy(
&m_framebuffer.mmap[y * m_framebuffer.width + area.x],
&client_ptr[y * client_width + area.x],
area.width * sizeof(uint32_t)
);
}
} }
else else
{ {
for (int32_t y = area.y; y < area.y + area.height; y++) for (int32_t y = area.y; y < area.y + area.height; y++)
{ {
for (int32_t x = area.x; x < area.x + area.width; x++) const uint32_t* window_row = &client_ptr[y * client_width + area.x];
const uint32_t* image_row = &m_background_image[y * m_framebuffer.width + area.x];
uint32_t* frameb_row = &m_framebuffer.mmap[y * m_framebuffer.width + area.x];
int32_t pixels = area.width;
for (; pixels >= 4; pixels -= 4)
{ {
const uint32_t src_pixel = m_focused_window->framebuffer()[y * m_focused_window->client_width() + x]; alpha_blend4(window_row, image_row, frameb_row);
const uint32_t bg_pixel = m_background_image->get_color(x, y).as_argb(); frameb_row += 4;
m_framebuffer.mmap[y * m_framebuffer.width + x] = alpha_blend(src_pixel, bg_pixel); window_row += 4;
image_row += 4;
}
for (; pixels > 0; pixels--)
{
*frameb_row = alpha_blend(*window_row, *image_row);
frameb_row++;
window_row++;
image_row++;
} }
} }
} }
mark_pending_sync(area); mark_pending_sync(area);
} }
else else
@@ -949,8 +1011,8 @@ void WindowServer::invalidate(Rectangle area)
}.get_overlap(m_framebuffer.area()); }.get_overlap(m_framebuffer.area());
if (!opt_dst_area.has_value()) if (!opt_dst_area.has_value())
return; return;
const auto dst_area = opt_dst_area.release_value();
const auto dst_area = opt_dst_area.release_value();
for (int32_t dst_y = dst_area.y; dst_y < dst_area.y + dst_area.height; dst_y++) for (int32_t dst_y = dst_area.y; dst_y < dst_area.y + dst_area.height; dst_y++)
{ {
for (int32_t dst_x = dst_area.x; dst_x < dst_area.x + dst_area.width; dst_x++) for (int32_t dst_x = dst_area.x; dst_x < dst_area.x + dst_area.width; dst_x++)
@@ -959,7 +1021,7 @@ void WindowServer::invalidate(Rectangle area)
const int32_t src_y = BAN::Math::clamp<int32_t>(dst_y * m_focused_window->client_height() / m_framebuffer.height, 0, m_focused_window->client_height()); const int32_t src_y = BAN::Math::clamp<int32_t>(dst_y * m_focused_window->client_height() / m_framebuffer.height, 0, m_focused_window->client_height());
const uint32_t src_pixel = m_focused_window->framebuffer()[src_y * m_focused_window->client_width() + src_x]; const uint32_t src_pixel = m_focused_window->framebuffer()[src_y * m_focused_window->client_width() + src_x];
const uint32_t bg_pixel = m_background_image->get_color(dst_x, dst_y).as_argb(); const uint32_t bg_pixel = m_background_image[dst_y * m_framebuffer.width + dst_x];
uint32_t& dst_pixel = m_framebuffer.mmap[dst_y * m_framebuffer.width + dst_x]; uint32_t& dst_pixel = m_framebuffer.mmap[dst_y * m_framebuffer.width + dst_x];
dst_pixel = should_alpha_blend ? alpha_blend(src_pixel, bg_pixel) : src_pixel; dst_pixel = should_alpha_blend ? alpha_blend(src_pixel, bg_pixel) : src_pixel;
@@ -1013,8 +1075,13 @@ void WindowServer::invalidate(Rectangle area)
area = fb_overlap.release_value(); area = fb_overlap.release_value();
for (int32_t y = area.y; y < area.y + area.height; y++) for (int32_t y = area.y; y < area.y + area.height; y++)
for (int32_t x = area.x; x < area.x + area.width; x++) {
m_framebuffer.mmap[y * m_framebuffer.width + x] = m_background_image->get_color(x, y).as_argb(); memcpy(
&m_framebuffer.mmap[y * m_framebuffer.width + area.x],
&m_background_image[y * m_framebuffer.width + area.x],
area.width * sizeof(uint32_t)
);
}
// FIXME: this loop should be inverse order and terminate // FIXME: this loop should be inverse order and terminate
// after window without alpha channel is found // after window without alpha channel is found
@@ -1122,8 +1189,8 @@ void WindowServer::invalidate(Rectangle area)
if (is_rounded_off(window, { abs_x, abs_y })) if (is_rounded_off(window, { abs_x, abs_y }))
continue; continue;
const uint32_t color = window.title_bar_pixel(abs_x, abs_y, m_cursor); m_framebuffer.mmap[abs_y * m_framebuffer.width + abs_x] =
m_framebuffer.mmap[abs_y * m_framebuffer.width + abs_x] = color; window.title_bar_pixel(abs_x, abs_y, m_cursor);
} }
} }
} }
@@ -1131,15 +1198,19 @@ void WindowServer::invalidate(Rectangle area)
// window client area // window client area
if (auto client_overlap = window.client_area().get_overlap(area); client_overlap.has_value()) if (auto client_overlap = window.client_area().get_overlap(area); client_overlap.has_value())
{ {
const bool should_alpha_blend = window.get_attributes().alpha_channel;
for (const auto& fast_area : fast_areas) for (const auto& fast_area : fast_areas)
{ {
auto fast_overlap = client_overlap->get_overlap(fast_area); auto opt_fast_overlap = client_overlap->get_overlap(fast_area);
if (!fast_overlap.has_value()) if (!opt_fast_overlap.has_value())
continue; continue;
for (int32_t y_off = 0; y_off < fast_overlap->height; y_off++)
const auto fast_overlap = opt_fast_overlap.release_value();
for (int32_t y_off = 0; y_off < fast_overlap.height; y_off++)
{ {
const int32_t abs_row_y = fast_overlap->y + y_off; const int32_t abs_row_y = fast_overlap.y + y_off;
const int32_t abs_row_x = fast_overlap->x; const int32_t abs_row_x = fast_overlap.x;
const int32_t src_row_y = abs_row_y - window.client_y(); const int32_t src_row_y = abs_row_y - window.client_y();
const int32_t src_row_x = abs_row_x - window.client_x(); const int32_t src_row_x = abs_row_x - window.client_x();
@@ -1147,31 +1218,40 @@ void WindowServer::invalidate(Rectangle area)
auto* window_row = &window.framebuffer()[src_row_y * window.client_width() + src_row_x]; auto* window_row = &window.framebuffer()[src_row_y * window.client_width() + src_row_x];
auto* frameb_row = &m_framebuffer.mmap[abs_row_y * m_framebuffer.width + abs_row_x]; auto* frameb_row = &m_framebuffer.mmap[abs_row_y * m_framebuffer.width + abs_row_x];
const bool should_alpha_blend = window.get_attributes().alpha_channel; if (!should_alpha_blend)
for (int32_t i = 0; i < fast_overlap->width; i++) memcpy(frameb_row, window_row, fast_overlap.width * sizeof(uint32_t));
else
{ {
const uint32_t color_a = *window_row; int32_t pixels = fast_overlap.width;
const uint32_t color_b = *frameb_row; for (; pixels >= 4; pixels -= 4)
*frameb_row = should_alpha_blend {
? alpha_blend(color_a, color_b) alpha_blend4(window_row, frameb_row, frameb_row);
: color_a; window_row += 4;
frameb_row += 4;
}
for (; pixels; pixels--)
{
*frameb_row = alpha_blend(*window_row, *frameb_row);
window_row++; window_row++;
frameb_row++; frameb_row++;
} }
} }
} }
}
for (const auto& corner_area : corner_areas) for (const auto& corner_area : corner_areas)
{ {
auto corner_overlap = client_overlap->get_overlap(corner_area); auto opt_corner_overlap = client_overlap->get_overlap(corner_area);
if (!corner_overlap.has_value()) if (!opt_corner_overlap.has_value())
continue; continue;
for (int32_t y_off = 0; y_off < corner_overlap->height; y_off++)
const auto corner_overlap = opt_corner_overlap.release_value();
for (int32_t y_off = 0; y_off < corner_overlap.height; y_off++)
{ {
for (int32_t x_off = 0; x_off < corner_overlap->width; x_off++) for (int32_t x_off = 0; x_off < corner_overlap.width; x_off++)
{ {
const int32_t abs_x = corner_overlap->x + x_off; const int32_t abs_x = corner_overlap.x + x_off;
const int32_t abs_y = corner_overlap->y + y_off; const int32_t abs_y = corner_overlap.y + y_off;
if (is_rounded_off(window, { abs_x, abs_y })) if (is_rounded_off(window, { abs_x, abs_y }))
continue; continue;
@@ -1181,7 +1261,7 @@ void WindowServer::invalidate(Rectangle area)
const uint32_t color_a = window.framebuffer()[src_y * window.client_width() + src_x]; const uint32_t color_a = window.framebuffer()[src_y * window.client_width() + src_x];
const uint32_t color_b = m_framebuffer.mmap[abs_y * m_framebuffer.width + abs_x]; const uint32_t color_b = m_framebuffer.mmap[abs_y * m_framebuffer.width + abs_x];
const bool should_alpha_blend = window.get_attributes().alpha_channel; // NOTE: corners are small so we do them one pixel at a time to keep the code simple
m_framebuffer.mmap[abs_y * m_framebuffer.width + abs_x] = should_alpha_blend m_framebuffer.mmap[abs_y * m_framebuffer.width + abs_x] = should_alpha_blend
? alpha_blend(color_a, color_b) ? alpha_blend(color_a, color_b)
: color_a; : color_a;
@@ -1193,14 +1273,26 @@ void WindowServer::invalidate(Rectangle area)
if (m_state == State::Resizing) if (m_state == State::Resizing)
{ {
if (const auto overlap = resize_area(m_cursor).get_overlap(area); overlap.has_value()) if (auto opt_overlap = resize_area(m_cursor).get_overlap(area); opt_overlap.has_value())
{ {
for (int32_t y_off = 0; y_off < overlap->height; y_off++) constexpr uint32_t blend_color = 0x80000000;
const auto overlap = opt_overlap.release_value();
for (int32_t y = overlap.y; y < overlap.y + overlap.height; y++)
{ {
for (int32_t x_off = 0; x_off < overlap->width; x_off++) uint32_t* frameb_row = &m_framebuffer.mmap[y * m_framebuffer.width + overlap.x];
int32_t pixels = overlap.width;
for (; pixels >= 4; pixels -= 4)
{ {
auto& pixel = m_framebuffer.mmap[(overlap->y + y_off) * m_framebuffer.width + (overlap->x + x_off)]; const uint32_t blend_colors[] { blend_color, blend_color, blend_color, blend_color };
pixel = alpha_blend(0x80000000, pixel); alpha_blend4(blend_colors, frameb_row, frameb_row);
frameb_row += 4;
}
for (; pixels > 0; pixels--)
{
*frameb_row = alpha_blend(blend_color, *frameb_row);
frameb_row++;
} }
} }
} }

View File

@@ -96,7 +96,8 @@ private:
BAN::Vector<RangeList> m_pending_syncs; BAN::Vector<RangeList> m_pending_syncs;
BAN::UniqPtr<LibImage::Image> m_background_image; // NOTE: same size as framebuffer
BAN::Vector<uint32_t> m_background_image;
State m_state { State::Normal }; State m_state { State::Normal };
bool m_is_mod_key_held { false }; bool m_is_mod_key_held { false };