/usr/share/movit/deinterlace_effect.comp is in libmovit8 1.6.1-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

// Implicit uniforms:
// uniform int PREFIX(current_field_position);
// uniform float PREFIX(inv_width);
// uniform float PREFIX(inv_height);
// uniform float PREFIX(current_field_vertical_offset);

// Compute shader implementation of DeinterlaceEffect. See the fragment
// shader implementation (deinterlace_effect.frag) for comments about the
// algorithm; comments here will mainly be about issues specific to the
// compute shader implementation.

#define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))

// In input pixels (so output will be 8x32). Corresponds to get_compute_dimensions()
// in the C++ code. It seems illogical that 8x32 would be better than e.g. 32x8,
// since a wider group would reuse more data horizontally, but Intel cards in
// particular are much happier with this shape for whatever reason.
#define GROUP_W 8
#define GROUP_H 16
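// With GROUP_W = 8 and GROUP_H = 16, each workgroup has 8 * 16 = 128 threads;
// every thread writes two vertically adjacent output rows, so one workgroup
// covers an 8x32 output tile.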

// When sampling from the current field (spatial interpolation below), we have
// a fringe of three pixels on the left and right sides, so we need to load
// more. We also have one pixel above and below, although our destination pixel
// is squeezed in the middle of them (they don't overlap), so we only need one
// extra pixel.
#define GROUP_W_FRINGE (GROUP_W + 6)
#define GROUP_H_FRINGE (GROUP_H + 1)

layout(local_size_x = GROUP_W, local_size_y = GROUP_H) in;

#if (GROUP_W_FRINGE * GROUP_H_FRINGE) > (GROUP_W * (GROUP_H + 2))
#define TEMP_NUM_ELEM (GROUP_W_FRINGE * GROUP_H_FRINGE)
#else
#define TEMP_NUM_ELEM (GROUP_W * (GROUP_H + 2))
#endif

shared vec4 temp[TEMP_NUM_ELEM];

#if TEMP_NUM_ELEM > (GROUP_W * GROUP_H * 2)
#error Not enough threads to load all data in two loads
#endif
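// With the default sizes: 14 * 17 = 238 elements for the fringed block versus
// 8 * (16 + 2) = 144 for the temporal blocks, so temp[] holds 238 vec4s.
// Two load passes with 128 threads cover up to 256 elements, so the #error
// above does not trigger.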

// Load a WxH block of samples. We need to do this in two phases,
// since we have more input samples than we have output samples (threads);
// in the second phase, some threads will be idle.
#define LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) \
{ \
	memoryBarrierShared(); \
	barrier(); \
	int thread_id = int(gl_LocalInvocationID.y) * GROUP_W + int(gl_LocalInvocationID.x); \
	{ \
		int x = thread_id % (block_width); \
		int y = thread_id / (block_width); \
		temp[thread_id] = func(vec2((base_tc).x + x * PREFIX(inv_width), \
		                            (base_tc).y + y * PREFIX(inv_height))); \
	} \
	const int num_threads = GROUP_W * GROUP_H; \
	if (thread_id + num_threads < (block_width) * (block_height)) { \
		int x = (thread_id + num_threads) % (block_width); \
		int y = (thread_id + num_threads) / (block_width); \
		temp[thread_id + num_threads] = \
			func(vec2((base_tc).x + x * PREFIX(inv_width), \
		                  (base_tc).y + y * PREFIX(inv_height))); \
	} \
	memoryBarrierShared(); \
	barrier(); \
}
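// For the 14x17 current-field block (238 samples) and 128 threads, the first
// pass fills elements 0..127 and the second pass fills 128..237 using threads
// 0..109; the remaining 18 threads are idle in the second pass.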

void FUNCNAME() {
	// The current thread is responsible for output of two pixels, namely (x,2y)
	// and (x,2y+1). One will be an unmodified one, the other one will be the
	// pixel we are trying to interpolate. If TFF (current_field_position==0),
	// the unmodified one is 2y+1 (remember OpenGL's bottom-left convention),
	// and if BFF, the unmodified one is 2y. So we need to invert current_field_position
	// to figure out which value to add.
	int yi = int(gl_GlobalInvocationID.y) * 2 + (PREFIX(current_field_position) ^ 1);
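	// For example, with TFF this gives yi = 2y + 1: row 2y + 1 is copied through
	// unchanged just below, and row 2y (yi ^ 1, written at the very end) receives
	// the interpolated value. With BFF it is the other way around.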

	// Load in data for the current field. current_field_vertical_offset signals
	// where the block starts vertically; see set_gl_state() in the C++ code.
	vec2 base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + (0.5f - 3.0f)) * PREFIX(inv_width),
	                    (gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset));
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W_FRINGE, GROUP_H_FRINGE, INPUT3);

	int lx = int(gl_LocalInvocationID.x) + 3;
	int ly = int(gl_LocalInvocationID.y);

	// Output the unmodified pixel. For TFF (current_field_position == 0),
	// we have an extra pixel on the bottom that we're only using for interpolation
	// (it's being output by another workgroup), so we have to add 1.
	vec4 val = temp[(ly + (PREFIX(current_field_position) ^ 1)) * GROUP_W_FRINGE + lx];
	OUTPUT(ivec2(gl_GlobalInvocationID.x, yi), val);

	// a b c d e f g     ↑ y
	//       x           |
	// h i j k l m n     +--> x

	vec4 a = temp[(ly + 1) * GROUP_W_FRINGE + lx - 3];
	vec4 b = temp[(ly + 1) * GROUP_W_FRINGE + lx - 2];
	vec4 c = temp[(ly + 1) * GROUP_W_FRINGE + lx - 1];
	vec4 d = temp[(ly + 1) * GROUP_W_FRINGE + lx];
	vec4 e = temp[(ly + 1) * GROUP_W_FRINGE + lx + 1];
	vec4 f = temp[(ly + 1) * GROUP_W_FRINGE + lx + 2];
	vec4 g = temp[(ly + 1) * GROUP_W_FRINGE + lx + 3];

	vec4 h = temp[ly * GROUP_W_FRINGE + lx - 3];
	vec4 i = temp[ly * GROUP_W_FRINGE + lx - 2];
	vec4 j = temp[ly * GROUP_W_FRINGE + lx - 1];
	vec4 k = temp[ly * GROUP_W_FRINGE + lx];
	vec4 l = temp[ly * GROUP_W_FRINGE + lx + 1];
	vec4 m = temp[ly * GROUP_W_FRINGE + lx + 2];
	vec4 n = temp[ly * GROUP_W_FRINGE + lx + 3];

	// 0 degrees.
	vec4 pred = d + k;
	float score;
	float best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4;
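	// The small bias means a diagonal direction must beat the vertical one by
	// more than 1e-4 to be chosen, so near-ties fall back to plain vertical
	// interpolation.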

	// -45 degrees.
	score = DIFF(b, k) + DIFF(c, l) + DIFF(d, m);
	if (score < best_score) {
		pred = c + l;
		best_score = score;
	}

	// -63 degrees.
	score = DIFF(a, l) + DIFF(b, m) + DIFF(c, n);
	if (score < best_score) {
		pred = b + m;
		best_score = score;
	}

	// +45 degrees.
	score = DIFF(d, i) + DIFF(e, j) + DIFF(f, k);
	if (score < best_score) {
		pred = e + j;
		best_score = score;
	}

	// +63 degrees.
	score = DIFF(e, h) + DIFF(f, i) + DIFF(g, j);
	if (score < best_score) {
		pred = f + i;
		// best_score isn't used anymore.
	}

	pred *= 0.5f;
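	// pred is now the average of the two samples straddling the target pixel
	// along the winning direction, e.g. 0.5 * (d + k) if 0 degrees scored best.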

	// Temporal prediction (p2) of this pixel based on the previous and next fields.
	//
	//                ↑ y
	//     C   H      |
	//   A   F   K    |
	//     D x I      |
	//   B   G   L    |
	//     E   J      |
	//                +-----> time
	//
	// x is obviously aligned with D and I, so we don't need texcoord
	// adjustment for top/bottom field here, unlike earlier. However, we need
	// to start the block one pixel below since we need E/J, thus the -1 in
	// the y coordinate.
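	// Columns in the diagram map to the five input fields in temporal order:
	// A/B come from INPUT1, C/D/E from INPUT2, F/G from the current field
	// (INPUT3, already loaded above), H/I/J from INPUT4 and K/L from INPUT5.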
	base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width),
	               (gl_WorkGroupID.y * uint(GROUP_H) + (0.5f - 1.0f)) * PREFIX(inv_height));
	lx = int(gl_LocalInvocationID.x);
#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT2);
	vec4 C = temp[(ly + 2) * GROUP_W + lx];
	vec4 D = temp[(ly + 1) * GROUP_W + lx];
	vec4 E = temp[ ly      * GROUP_W + lx];

	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT4);
	vec4 H = temp[(ly + 2) * GROUP_W + lx];
	vec4 I = temp[(ly + 1) * GROUP_W + lx];
	vec4 J = temp[ ly      * GROUP_W + lx];
#else
	// Since the spatial interlacing check is not enabled, we only need D
	// and I from the previous and next fields; since they are not shared
	// between neighboring pixels, they can be straight-up loads.
	vec2 DI_pos = vec2((gl_GlobalInvocationID.x + 0.5f) * PREFIX(inv_width),
	                   (gl_GlobalInvocationID.y + 0.5f) * PREFIX(inv_height));
	vec4 D = INPUT2(DI_pos);
	vec4 I = INPUT4(DI_pos);
#endif

	// Load what we need from the previous field into shared memory,
	// since A/B can be reused between neighboring pixels. We need one
	// line above/below, but we don't need the horizontal fringe.
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT1);
	vec4 A = temp[(ly + 1) * GROUP_W + lx];
	vec4 B = temp[ ly      * GROUP_W + lx];

	// What we need from the current field was loaded earlier.
	vec4 F = d;
	vec4 G = k;

	// Next field.
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT5);
	vec4 K = temp[(ly + 1) * GROUP_W + lx];
	vec4 L = temp[ ly      * GROUP_W + lx];

	// Find temporal differences around this line.
	vec4 tdiff0 = abs(D - I);
	vec4 tdiff1 = abs(A - F) + abs(B - G);  // Actually twice tdiff1.
	vec4 tdiff2 = abs(K - F) + abs(L - G);  // Actually twice tdiff2.
	vec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2));
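	// tdiff1 and tdiff2 each sum two per-row differences, so the 0.5f factor
	// turns them back into averages before taking the max; diff becomes the
	// half-width of the clamp range applied around p2 at the very end.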

#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
	// Spatial interlacing check.
	// We start by temporally interpolating the current vertical line (p0–p4):
	//
	//     C p0 H      ↑ y
	//       p1        |
	//     D p2 I      |
	//       p3        |
	//     E p4 J      +-----> time
	//
	vec4 p0 = 0.5f * (C + H);
	vec4 p1 = F;
	vec4 p2 = 0.5f * (D + I);
	vec4 p3 = G;
	vec4 p4 = 0.5f * (E + J);

	vec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3));
	vec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3));
	diff = max(diff, max(min_, -max_));
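	// min_ is positive only when p2 lies above both in-field neighbors p1 and p3
	// and at least one of p0 > p1 or p4 > p3 also holds (-max_ is the mirrored
	// "below" case); when that happens, diff widens and the clamp on the spatial
	// prediction below is relaxed.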
#else
	vec4 p2 = 0.5f * (D + I);
#endif

	val = clamp(pred, p2 - diff, p2 + diff);
	OUTPUT(ivec2(gl_GlobalInvocationID.x, yi ^ 1), val);
}

#undef LOAD_PIXEL_BLOCK
#undef DIFF
#undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK