Skip to main content

skip to main content

developerWorks  >  Power Architecture technology  >

Unrolling AltiVec, Part 3: Down and dirty loop optimization

Learn how to tailor your code for AltiVec

developerWorks

Return to article


Listing 2. The AltiVec-optimized vectorized code (color_vec.c)


#include <sys/time.h>
#include <math.h>
#include <stdio.h>

void
inc_hsv(int size, unsigned char *red, unsigned char *green, unsigned char *blue) {
	int i;
	unsigned char *r, *g, *b;
	vector float one_255th = { 1.0 / 255, 1.0 / 255, 1.0 / 255, 1.0 / 255 };
	vector float fh_factor = { 6.0 / 252.0, 6.0 / 252.0,
				   6.0 / 252.0, 6.0 / 252.0 };
	vector float vec_zero = { 0.0 };
	vector float vec_half = { 0.5, 0.5, 0.5, 0.5 };
	vector float vec_unity = { 1.0, 1.0, 1.0, 1.0 };
	vector unsigned char u8_1 = vec_splat_u8(1);
	vector unsigned short u16_1 = vec_splat_u16(1);

	// multiplying converts:
	// 0123456789abcdef
	// to:
	// 000044448888cccc
	// 22226666aaaaeeee
	// 111155559999dddd
	// 33337777bbbbffff

	unsigned char inorder_values[16] = {
		0x00, 0x08, 0x04, 0x0c,
		0x01, 0x09, 0x05, 0x0d,
		0x02, 0x0a, 0x06, 0x0e,
		0x03, 0x0b, 0x07, 0x0f
	};
	vector unsigned char inorder = vec_ld(0, inorder_values);
	unsigned char split_values[16] = {
		0x00, 0x08, 0x01, 0x09,
		0x02, 0x0a, 0x03, 0x0b,
		0x04, 0x0c, 0x05, 0x0d,
		0x06, 0x0e, 0x07, 0x0f
	};
	vector unsigned char split = vec_ld(0, split_values);

	vector unsigned char u8_42 = vec_add(vec_splat_u8(10), vec_splat_u8(11));
	// start it at 21, double to get 42.
	u8_42 = vec_add(u8_42, u8_42);

	for (i = 0; i < size; i += 16) {
		int j;

		register vector unsigned char r16, g16, b16;
		register vector unsigned char max16, min16, delta16;
		vector bool char r_max, g_max, b_max;
		vector unsigned char hue_r16, hue_g16, hue_b16;
		vector unsigned char hue16 = (vector unsigned char) {0};
		vector unsigned char sat16;
		unsigned char delta[16];
		unsigned char hue_r[16];
		unsigned char hue_g[16];
		unsigned char hue_b[16];
		vector unsigned char scratch8;
		vector unsigned short scratch[4];
		vector float fh[4];
		vector float fv[4];
		vector float fs[4];
		vector unsigned char v84;
		unsigned char values[16];
		unsigned char sats[16];
		signed short g_b[16], b_r[16], r_g[16];

		r = red + i;
		g = green + i;
		b = blue + i;
		r16 = vec_ld(0, r);
		g16 = vec_ld(0, g);
		b16 = vec_ld(0, b);

		max16 = vec_max(vec_max(r16, g16), b16);
		min16 = vec_min(vec_min(r16, g16), b16);
		delta16 = vec_sub(max16, min16);

		vec_st(delta16, 0, delta);
		vec_st(max16, 0, values);
		for (j = 0; j < 16; ++j) {
			sats[j] = (255 * delta[j] / values[j]);
		}
		sat16 = vec_ld(0, sats);

		// these indicate which values to take from the
		// result registers
		r_max = vec_cmpeq(max16, r16);
		g_max = vec_cmpeq(max16, g16);
		b_max = vec_cmpeq(max16, b16);

		// precalculate x-y*42 values
		scratch8 = vec_perm(g16, g16, split);
		scratch[0] = vec_mule(scratch8, u8_42);
		scratch[1] = vec_mulo(scratch8, u8_42);
		scratch8 = vec_perm(b16, b16, split);
		scratch[2] = vec_mule(scratch8, u8_42);
		scratch[3] = vec_mulo(scratch8, u8_42);
		vec_st(vec_sub((vector signed short) scratch[0], (vector
			signed short) scratch[2]), 0, g_b);
		vec_st(vec_sub((vector signed short) scratch[1], (vector
			signed short) scratch[3]), 16, g_b);

		scratch8 = vec_perm(b16, b16, split);
		scratch[0] = vec_mule(scratch8, u8_42);
		scratch[1] = vec_mulo(scratch8, u8_42);
		scratch8 = vec_perm(r16, r16, split);
		scratch[2] = vec_mule(scratch8, u8_42);
		scratch[3] = vec_mulo(scratch8, u8_42);
		vec_st(vec_sub((vector signed short) scratch[0], (vector
			signed short) scratch[2]), 0, b_r);
		vec_st(vec_sub((vector signed short) scratch[1], (vector
			signed short) scratch[3]), 16, b_r);

		scratch8 = vec_perm(r16, r16, split);
		scratch[0] = vec_mule(scratch8, u8_42);
		scratch[1] = vec_mulo(scratch8, u8_42);
		scratch8 = vec_perm(g16, g16, split);
		scratch[2] = vec_mule(scratch8, u8_42);
		scratch[3] = vec_mulo(scratch8, u8_42);
		vec_st(vec_sub((vector signed short) scratch[0], (vector
			signed short) scratch[2]), 0, r_g);
		vec_st(vec_sub((vector signed short) scratch[1], (vector
			signed short) scratch[3]), 16, r_g);

		// Hue ranges from 0 to 251, because 252 is divisible by 6,
		// and this eliminates a subtle drifting effect.
		for (j = 0; j < 16; ++j) {
			if (delta[j] == 0) {
				hue_r[j] = hue_g[j] = hue_b[j] = 0;
				continue;
			}
			if (r[j] == values[j]) {
				hue_r[j] = g_b[j] / (float) delta[j];
				if (hue_r[j] > 127)
					hue_r[j] -= 4;
			}
			if (g[j] == values[j]) {
				hue_g[j] = b_r[j] / (float) delta[j];
			}
			if (b[j] == values[j]) {
				hue_b[j] = r_g[j] / (float) delta[j];
			}
		}
		v84 = vec_add(u8_42, u8_42); // 42+42 = 84
		hue_r16 = vec_ld(0, hue_r);
		hue_g16 = vec_add(v84, vec_ld(0, hue_g));
		v84 = vec_add(v84, v84); // == 168
		hue_b16 = vec_add(v84, vec_ld(0, hue_b));

		hue16 = vec_sel(hue16, hue_r16, r_max);
		hue16 = vec_sel(hue16, hue_g16, g_max);
		hue16 = vec_sel(hue16, hue_b16, b_max);

		// we now have hue, sat, and value registers of 16 values at
		// once... unfortunately, we need to multiply by a float
		// value, so some splitting up needs to happen
		hue16 = vec_perm(hue16, hue16, inorder);
		scratch[0] = vec_mule(hue16, u8_1);
		scratch[1] = vec_mulo(hue16, u8_1);
		max16 = vec_perm(max16, max16, inorder);
		fh[0] = vec_madd(vec_ctf(vec_mule(scratch[0], u16_1), 0),
			fh_factor, vec_half);
		fh[1] = vec_madd(vec_ctf(vec_mulo(scratch[0], u16_1), 0),
			fh_factor, vec_half);
		scratch[0] = vec_mule(max16, u8_1);
		fh[2] = vec_madd(vec_ctf(vec_mule(scratch[1], u16_1), 0),
			fh_factor, vec_half);
		fh[3] = vec_madd(vec_ctf(vec_mulo(scratch[1], u16_1), 0),
			fh_factor, vec_half);
		scratch[1] = vec_mulo(max16, u8_1);
		sat16 = vec_perm(sat16, sat16, inorder);
		fv[0] = vec_ctf(vec_mule(scratch[0], u16_1), 0);
		fv[1] = vec_ctf(vec_mulo(scratch[0], u16_1), 0);
		scratch[0] = vec_mule(sat16, u8_1);
		fv[2] = vec_ctf(vec_mule(scratch[1], u16_1), 0);
		fv[3] = vec_ctf(vec_mulo(scratch[1], u16_1), 0);
		scratch[1] = vec_mulo(sat16, u8_1);
		fs[0] = vec_ctf(vec_mule(scratch[0], u16_1), 0);
		fs[1] = vec_ctf(vec_mulo(scratch[0], u16_1), 0);
		fs[2] = vec_ctf(vec_mule(scratch[1], u16_1), 0);
		fs[3] = vec_ctf(vec_mulo(scratch[1], u16_1), 0);

		for (j = 0; j < 4; ++j) {
			vector unsigned int sector;
			vector float remainder;
			vector float floor;
			vector float six;
			vector float sub6;
			vector bool int dosub;
			vector float p, q, t;
			vector float vec255 = { 255.0, 255.0, 255.0, 255.0 };
			float ps[4], qs[4], ts[4];
			unsigned int sectors[4];
			int k;

			six = (vector float) { 6.0, 6.0, 6.0, 6.0 };
			dosub = vec_cmpgt(fh[j], six);
			sub6 = vec_sub(fh[j], six);
			fh[j] = vec_sel(fh[j], sub6, dosub);

			floor = vec_floor(fh[j]);
			sector = vec_ctu(floor, 0);
			vec_st(sector, 0, sectors);
			remainder = vec_sub(fh[j], floor);
			{
				/* p = (255 - s) * (max / 255);
				 * q = (255 - s * f) * (max / 255);
				 * t = (255 - s * (1 - f)) * (max / 255);
				 */
				vector float max_255;
				max_255 = vec_madd(fv[j], one_255th, vec_zero);
				p = vec_sub(vec255, fs[j]);
				q = vec_sub(vec255, vec_madd(fs[j], remainder, vec_zero));
				t = vec_sub(vec255,
					vec_madd(fs[j],
						vec_sub(vec_unity, remainder),
						vec_zero));

				p = vec_madd(p, max_255, vec_zero);
				q = vec_madd(q, max_255, vec_zero);
				t = vec_madd(t, max_255, vec_zero);
			}
			vec_st(p, 0, ps);
			vec_st(q, 0, qs);
			vec_st(t, 0, ts);
			for (k = 0; k < 4; ++k) {
				int index = j * 4 + k;

				switch (sectors[k]) {
					case 0:
						r[index] = values[index];
						g[index] = ts[k];
						b[index] = ps[k];
						break;
					case 1:
						r[index] = qs[k];
						g[index] = values[index];
						b[index] = ps[k];
						break;
					case 2:
						r[index] = ps[k];
						g[index] = values[index];
						b[index] = ts[k];
						break;
					case 3:
						r[index] = ps[k];
						g[index] = qs[k];
						b[index] = values[index];
						break;
					case 4:
						r[index] = ts[k];
						g[index] = ps[k];
						b[index] = values[index];
						break;
					default:		// case 5:
						r[index] = values[index];
						g[index] = ps[k];
						b[index] = qs[k];
						break;
				}
			}

		}
	}
}

/*
 * Seed palette: 32 fully saturated colours stepping once around the colour
 * wheel.  One channel is held at 0xff while another ramps in steps of 0x30.
 */
struct { int r, g, b; } colors[] = {
	/* red held at 0xff, green rising */
	{ 0xff, 0x00, 0x00 }, { 0xff, 0x2f, 0x00 }, { 0xff, 0x5f, 0x00 },
	{ 0xff, 0x8f, 0x00 }, { 0xff, 0xbf, 0x00 }, { 0xff, 0xef, 0x00 },
	/* green held at 0xff, red falling */
	{ 0xdf, 0xff, 0x00 }, { 0xaf, 0xff, 0x00 }, { 0x7f, 0xff, 0x00 },
	{ 0x4f, 0xff, 0x00 }, { 0x1f, 0xff, 0x00 },
	/* green held at 0xff, blue rising */
	{ 0x00, 0xff, 0x0f }, { 0x00, 0xff, 0x3f }, { 0x00, 0xff, 0x6f },
	{ 0x00, 0xff, 0x9f }, { 0x00, 0xff, 0xcf }, { 0x00, 0xff, 0xff },
	/* blue held at 0xff, green falling */
	{ 0x00, 0xcf, 0xff }, { 0x00, 0x9f, 0xff }, { 0x00, 0x6f, 0xff },
	{ 0x00, 0x3f, 0xff }, { 0x00, 0x0f, 0xff },
	/* blue held at 0xff, red rising */
	{ 0x1f, 0x00, 0xff }, { 0x4f, 0x00, 0xff }, { 0x7f, 0x00, 0xff },
	{ 0xaf, 0x00, 0xff }, { 0xdf, 0x00, 0xff },
	/* red held at 0xff, blue falling */
	{ 0xff, 0x00, 0xef }, { 0xff, 0x00, 0xbf }, { 0xff, 0x00, 0x8f },
	{ 0xff, 0x00, 0x5f }, { 0xff, 0x00, 0x2f },
};

/*
 * Planar image buffers, one byte per pixel per channel.  inc_hsv() reads
 * them with vec_ld(), which ignores the low four address bits, so each
 * plane is forced onto a 16-byte boundary.
 */
unsigned char red[65536] __attribute__((aligned(16)));
unsigned char green[65536] __attribute__((aligned(16)));
unsigned char blue[65536] __attribute__((aligned(16)));

int
main(void) {
	struct timezone dontcare = { 0, 0 };
	struct timeval before, after;
	long long microsec;
	int i, j = 0;

	// set up arrays from our little rainbow
	for (i = 0; i < 65536; ++i) {
		red[i] = colors[j].r;
		green[i] = colors[j].g;
		blue[i] = colors[j].b;
		++j;
		if (j == (sizeof(colors) / sizeof(*colors))) {
			j = 0;
		}
	}
	for (j = 0; j < 32; j += 2)
		printf("(%d) { %02x %02x %02x }\n", j, red[j], green[j], blue[j]);

	gettimeofday(&before, &dontcare);
	for (i = 0; i < 200; ++i) {
		inc_hsv(65536, red, green, blue);
	}
	gettimeofday(&after, &dontcare);
	microsec = (after.tv_usec - before.tv_usec) +
		1000000 * (after.tv_sec - before.tv_sec);
	for (j = 0; j < 32; j += 2)
		printf("(%d) { %02x %02x %02x }\n", j, red[j], green[j], blue[j]);
	printf("%lld microseconds\n", microsec);
	return 0;
}


Return to article