/usr/share/movit/deinterlace_effect.comp is in libmovit8 1.6.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
// Implicit uniforms:
// uniform int PREFIX(current_field_position);
// uniform float PREFIX(inv_width);
// uniform float PREFIX(inv_height);
// uniform float PREFIX(current_field_vertical_offset);
// Compute shader implementation of DeinterlaceEffect. See the fragment
// shader implementation (deinterlace_effect.frag) for comments about the
// algorithm; comments here will mainly be about issues specific to the
// compute shader implementation.
// Squared length of the difference between two samples.
#define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))

// Workgroup size, measured in input pixels. Every thread writes two output
// rows, so one workgroup produces an 8x32 block of output. These values
// correspond to get_compute_dimensions() in the C++ code. One would expect
// a wide group (e.g. 32x8) to win over a tall one (8x32), given that most of
// the data reuse is horizontal, but Intel cards in particular are much
// happier with the tall layout, for whatever reason.
#define GROUP_W 8
#define GROUP_H 16

// Size of the block read from the current field for the spatial
// interpolation. Horizontally, three extra pixels are needed on both the
// left and the right side. Vertically, the destination pixel sits between
// the line above and the line below it (they don't overlap), so a single
// extra line is enough.
#define GROUP_W_FRINGE (GROUP_W + 6)
#define GROUP_H_FRINGE (GROUP_H + 1)

layout(local_size_x = GROUP_W, local_size_y = GROUP_H) in;

// The shared scratch array is reused for every block that gets loaded, so it
// must be able to hold the larger of the two candidate sizes: the fringed
// block, or a GROUP_W-wide block with two extra lines.
#if (GROUP_W * (GROUP_H + 2)) >= (GROUP_W_FRINGE * GROUP_H_FRINGE)
#define TEMP_NUM_ELEM (GROUP_W * (GROUP_H + 2))
#else
#define TEMP_NUM_ELEM (GROUP_W_FRINGE * GROUP_H_FRINGE)
#endif
shared vec4 temp[TEMP_NUM_ELEM];

// LOAD_PIXEL_BLOCK lets each thread load at most two elements, so the
// scratch array must not be larger than twice the number of threads.
#if (GROUP_W * GROUP_H * 2) < TEMP_NUM_ELEM
#error Not enough threads to load all data in two loads
#endif
// Fill temp[] with a block_width x block_height block of samples, stored
// row by row: element (x, y) of the block is func() evaluated at base_tc
// moved x steps of inv_width horizontally and y steps of inv_height
// vertically.
//
// There are more samples to load than there are threads in the workgroup,
// so the load happens in two passes. In the first pass every thread loads
// the element matching its own linear index within the workgroup; in the
// second pass, each thread handles the element one full workgroup further
// on, and the threads for which that element lies outside the block sit idle.
//
// The barrier pair at the top keeps temp[] from being overwritten while
// other threads may still be reading its previous contents; the pair at the
// bottom makes sure the complete block has been written before anyone
// reads from it.
#define LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) \
{ \
	memoryBarrierShared(); \
	barrier(); \
	int first_slot = int(gl_LocalInvocationID.x) + GROUP_W * int(gl_LocalInvocationID.y); \
	int second_slot = first_slot + GROUP_W * GROUP_H; \
	temp[first_slot] = func(vec2( \
		(base_tc).x + (first_slot % (block_width)) * PREFIX(inv_width), \
		(base_tc).y + (first_slot / (block_width)) * PREFIX(inv_height))); \
	if (second_slot < (block_width) * (block_height)) { \
		temp[second_slot] = func(vec2( \
			(base_tc).x + (second_slot % (block_width)) * PREFIX(inv_width), \
			(base_tc).y + (second_slot / (block_width)) * PREFIX(inv_height))); \
	} \
	memoryBarrierShared(); \
	barrier(); \
}
void FUNCNAME() {
	// Every thread writes two output pixels, (x,2y) and (x,2y+1): one of them
	// is copied unchanged from the current field, the other one is the pixel
	// being interpolated. For TFF (current_field_position == 0) the copied
	// pixel is 2y+1 (OpenGL has its origin at the bottom left); for BFF it
	// is 2y. Hence the inverted field position is what gets added here.
	int kept_row_offset = PREFIX(current_field_position) ^ 1;
	int yi = 2 * int(gl_GlobalInvocationID.y) + kept_row_offset;

	// Fetch the current field into shared memory, including the three-pixel
	// fringe on the left. current_field_vertical_offset tells us where the
	// block starts vertically; see set_gl_state() in the C++ code.
	vec2 base_tc = vec2(
		(gl_WorkGroupID.x * uint(GROUP_W) + (0.5f - 3.0f)) * PREFIX(inv_width),
		(gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset));
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W_FRINGE, GROUP_H_FRINGE, INPUT3);

	// Positions within temp[] of this thread's column (past the left
	// fringe) on its own line and on the line directly above it.
	int ly = int(gl_LocalInvocationID.y);
	int cur_lo = ly * GROUP_W_FRINGE + int(gl_LocalInvocationID.x) + 3;
	int cur_hi = cur_lo + GROUP_W_FRINGE;

	// Write out the pixel that is kept as-is. For TFF, the bottom line of
	// the block is there for interpolation only (another workgroup outputs
	// it), so in that case we take the line one step up.
	vec4 kept = temp[cur_lo + kept_row_offset * GROUP_W_FRINGE];
	OUTPUT(ivec2(gl_GlobalInvocationID.x, yi), kept);

	// Neighborhood used for the spatial prediction of x:
	//
	// a b c d e f g ↑ y
	// x |
	// h i j k l m n +--> x
	vec4 a = temp[cur_hi - 3];
	vec4 b = temp[cur_hi - 2];
	vec4 c = temp[cur_hi - 1];
	vec4 d = temp[cur_hi];
	vec4 e = temp[cur_hi + 1];
	vec4 f = temp[cur_hi + 2];
	vec4 g = temp[cur_hi + 3];
	vec4 h = temp[cur_lo - 3];
	vec4 i = temp[cur_lo - 2];
	vec4 j = temp[cur_lo - 1];
	vec4 k = temp[cur_lo];
	vec4 l = temp[cur_lo + 1];
	vec4 m = temp[cur_lo + 2];
	vec4 n = temp[cur_lo + 3];

	// Start with the 0 degree candidate; the small constant subtracted from
	// its score means another direction must beat it by more than that
	// amount to replace it.
	vec4 pred = d + k;
	float best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4;
	float score;

	// -45 degrees.
	score = DIFF(b, k) + DIFF(c, l) + DIFF(d, m);
	if (score < best_score) {
		best_score = score;
		pred = c + l;
	}

	// -63 degrees.
	score = DIFF(a, l) + DIFF(b, m) + DIFF(c, n);
	if (score < best_score) {
		best_score = score;
		pred = b + m;
	}

	// +45 degrees.
	score = DIFF(d, i) + DIFF(e, j) + DIFF(f, k);
	if (score < best_score) {
		best_score = score;
		pred = e + j;
	}

	// +63 degrees. This is the last candidate, so there is no point in
	// updating best_score.
	score = DIFF(e, h) + DIFF(f, i) + DIFF(g, j);
	if (score < best_score) {
		pred = f + i;
	}

	// pred has been holding the sum of two pixels; turn it into their average.
	pred *= 0.5f;

	// Temporal prediction (p2) of this pixel from the previous and next fields.
	//
	// ↑ y
	// C H |
	// A F K |
	// D x I |
	// B G L |
	// E J |
	// +-----> time
	//
	// x lines up with D and I, so no top/bottom field adjustment of the
	// texture coordinate is needed this time. The block does have to start
	// one pixel further down, though, since E/J are needed; that is what
	// the -1 in the y coordinate is for.
	base_tc = vec2(
		(gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width),
		(gl_WorkGroupID.y * uint(GROUP_H) + (0.5f - 1.0f)) * PREFIX(inv_height));

	// Positions within temp[] for the fringeless blocks loaded from here on.
	int tmp_lo = ly * GROUP_W + int(gl_LocalInvocationID.x);
	int tmp_mid = tmp_lo + GROUP_W;
#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
	int tmp_hi = tmp_mid + GROUP_W;

	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT2);
	vec4 C = temp[tmp_hi];
	vec4 D = temp[tmp_mid];
	vec4 E = temp[tmp_lo];

	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT4);
	vec4 H = temp[tmp_hi];
	vec4 I = temp[tmp_mid];
	vec4 J = temp[tmp_lo];
#else
	// Without the spatial interlacing check, D and I are all we need from
	// these two fields. Neighboring pixels don't share them, so each thread
	// simply samples its own.
	vec2 DI_pos = vec2((gl_GlobalInvocationID.x + 0.5f) * PREFIX(inv_width),
	                   (gl_GlobalInvocationID.y + 0.5f) * PREFIX(inv_height));
	vec4 D = INPUT2(DI_pos);
	vec4 I = INPUT4(DI_pos);
#endif

	// A/B are reused between neighboring pixels, so they go through shared
	// memory. One line above/below is needed, but no horizontal fringe.
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT1);
	vec4 A = temp[tmp_mid];
	vec4 B = temp[tmp_lo];

	// F/G come from the current field, which we already have in registers.
	vec4 F = d;
	vec4 G = k;

	// K/L: same as A/B, but from INPUT5.
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT5);
	vec4 K = temp[tmp_mid];
	vec4 L = temp[tmp_lo];

	// Temporal differences around this line. tdiff1 and tdiff2 are really
	// twice the value their names suggest, which the 0.5 below compensates for.
	vec4 tdiff0 = abs(D - I);
	vec4 tdiff1 = abs(A - F) + abs(B - G);
	vec4 tdiff2 = abs(K - F) + abs(L - G);
	vec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2));

#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
	// Spatial interlacing check. First, interpolate the current vertical
	// line temporally (p0–p4):
	//
	// C p0 H ↑ y
	// p1 |
	// D p2 I |
	// p3 |
	// E p4 J +-----> time
	//
	vec4 p0 = 0.5f * (C + H);
	vec4 p1 = F;
	vec4 p2 = 0.5f * (D + I);
	vec4 p3 = G;
	vec4 p4 = 0.5f * (E + J);

	vec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3));
	vec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3));
	diff = max(diff, max(min_, -max_));
#else
	vec4 p2 = 0.5f * (D + I);
#endif

	// The spatial prediction is limited to at most diff away from the
	// temporal one, and the result goes into the other row of the pair.
	vec4 interpolated = clamp(pred, p2 - diff, p2 + diff);
	OUTPUT(ivec2(gl_GlobalInvocationID.x, yi ^ 1), interpolated);
}
// Drop the helper macros and the feature switch again, presumably so that
// they do not leak into whatever gets compiled after this file.
#undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
#undef DIFF
#undef LOAD_PIXEL_BLOCK
|