pd-mark RWCP OpenMP C API Specification


例("OpenMP C and C++ Application Program Interface Version 1.0")
A.1 単純ループ
#pragma omp parallel for
	for(i = 1; i < n; i++)
		 b[i] = (a[i] + a[i-1]) / 2.0;
A.2 条件コンパイル
/* Print the message whenever the implementation defines _OPENMP. */
#if defined(_OPENMP)
	printf("Compiled by an OpenMP-compliant implementation.\n");
#endif

/* Print the same message only when VERBOSE is defined as well. */
#ifdef _OPENMP
#ifdef VERBOSE
	printf("Compiled by an OpenMP-compliant implementation.\n");
#endif
#endif
A.3 parallel regionの使い方
/* Each thread handles its own part of x: the per-thread point count is
 * the total number of points divided by the size of the team. */
#pragma omp parallel shared(x, npoints) private(iam, np, ipoints)
{
	np = omp_get_num_threads();
	iam = omp_get_thread_num();
	ipoints = npoints / np;
	subdomain(x, iam, ipoints);
}
A.4 nowait clauseの使い方
/* Two work-sharing loops in one parallel region.  Both carry nowait, so
 * threads do not wait at the end of either loop. */
#pragma omp parallel
{
#pragma omp for nowait
	for (i = 1; i < n; ++i) {
		b[i] = (a[i - 1] + a[i]) / 2.0;
	}

#pragma omp for nowait
	for (i = 0; i < m; ++i) {
		y[i] = sqrt(z[i]);
	}
}
A.5 critical directiveの使い方
/*
 * Two named critical sections guard the two dequeue calls.  Each critical
 * directive governs only the single statement that follows it, so the
 * work() calls run outside the critical sections.
 */
#pragma omp parallel shared(x, y) private(x_next, y_next)
{
#pragma omp critical (xaxis)
	x_next = dequeue(x);
	work(x_next);
#pragma omp critical (yaxis)
	y_next = dequeue(y);
	work(y_next);
}
A.6 lastprivate clauseの使い方
/*
 * Fill a[0..n-2] in parallel, then finish the remaining element serially.
 * lastprivate(i) hands the value i has after the sequentially last
 * iteration back to the enclosing i, so when the loop runs at least once
 * the statement after the region stores into a[n-1].
 */
#pragma omp parallel
{
	#pragma omp for lastprivate(i)
	for(i = 0; i < n-1; i++)
		a[i] = b[i] + b[i+1];
}
a[i] = b[i];
A.7 reduction clauseの使い方
/*
 * Sum the elements of x into a and of y into b.  Both accumulators are
 * listed in the reduction(+) clause, so each thread adds into its own
 * copy and the copies are combined at the end of the loop.
 */
#pragma omp parallel for private(i) shared(x, y, n) reduction(+: a, b)
	for(i = 0; i < n; i++){
		a = a + x[i];
		b = b + y[i];
	}
A.8 parallel sectionsの使い方
/*
 * Three independent calls, each placed in its own section of a combined
 * parallel sections construct.
 */
#pragma omp parallel sections
{
#pragma omp section
	xaxis();
#pragma omp section
	yaxis();
#pragma omp section
	zaxis();
}
A.9 single directiveの使い方
/*
 * Progress messages inside a parallel region.  Each single directive
 * governs only the printf that follows it, so every message is printed
 * by one thread, while work1() and work2() are called by all threads.
 * The last single carries nowait, so the other threads do not wait for
 * that message before calling work2().
 */
#pragma omp parallel
{
#pragma omp single
	printf("Beginning work1.\n");
	work1();
#pragma omp single
	printf("Finishing work1.\n");
#pragma omp single nowait
	printf("Finished work1 and beginning work2.\n");
	work2();
}

A.10 ordered directiveの使い方
/* Dynamically scheduled work-sharing loop with the ordered clause; each
 * iteration calls work() with the loop index. */
#pragma omp for ordered schedule(dynamic)
	for (i = lb; i < ub; i += st) {
		work(i);
	}

/* Print k, preceded by a space, from inside an ordered region. */
void work(int k)
{
#pragma omp ordered
	{
		printf(" %d", k);
	}
}
A.11 スレッド数を指定する例
	/* Turn dynamic adjustment off and request 16 threads; the region
	 * aborts if the team turns out to have any other size, because the
	 * work split below is hard-wired to 16. */
	omp_set_dynamic(0);
	omp_set_num_threads(16);
#pragma omp parallel shared(x, npoints) private(iam, ipoints)
{
	if (omp_get_num_threads() != 16) {
		abort();
	}
	ipoints = npoints / 16;
	iam = omp_get_thread_num();
	do_by_16(x, iam, ipoints);
}
A.12 atomic directiveの使い方
/*
 * Scattered update of x through an index array.  The atomic directive
 * governs only the x[index[i]] update that directly follows it; the
 * y[i] update on the next line is an ordinary statement.
 */
#pragma omp parallel for shared(x, y, index, n)
	for(i = 0; i < n; i++){
	#pragma omp atomic
		x[index[i]] += work1(i);
		y[i] += work2(i);
	}
注: y[i]+= work2(i) は、atomicではない。
A.13 flush directiveの使い方
/*
 * Point-to-point hand-off between threads through the shared arrays
 * work[] and sync[]: each thread clears its sync slot, writes its work
 * slot, raises its sync slot, then spins until its neighbor's slot is
 * raised before reading the neighbor's work value.  The "..." tokens are
 * placeholders for the actual computation.
 */
#pragma omp parallel private(iam, neighbor) shared(work, sync)
{
	iam = omp_get_thread_num();
	sync[iam] = 0;
#pragma omp barrier

	/* Do computation into my portion of work array */
	work[iam] = ...;

	/* Announce that I am done with my work
	 * The first flush ensures that my work is made visible before sync.
	 * The second flush ensures that sync is made visible.
	 */
#pragma omp flush (work)
	sync[iam] = 1;
#pragma omp flush(sync)

	/* Wait for neighbor */
	/* Thread 0 pairs with the last thread; every other thread with iam-1. */
	neighbor = ( iam > 0? iam: omp_get_num_threads() ) - 1;
	while(sync[neighbor] == 0){
	#pragma omp flush (sync)
	}

	/* Read neighbor's values of work array */
	... = work[neighbor];
}
A.14 listを用いないflush directiveの使い方
int x, *p = &x;

/* Store 1 through q, then execute a flush directive with no list. */
void f1(int *q)
{
	q[0] = 1;
#pragma omp flush
	/*
	 * The list-less flush covers x, p and *q: all of them are
	 * shared and accessible at this point.
	 */
}

/* Store 2 through q, then execute a barrier directive. */
void f2(int *q)
{
	q[0] = 2;
#pragma omp barrier
	/*
	 * The barrier implies a flush, so x, p and *q are flushed here:
	 * all of them are shared and accessible at this point.
	 */
}

/*
 * Calls f1() and f2() from a parallel region and accumulates a result in
 * the reduction variable sum, which is returned after the region.
 */
int g(int n)
{
	int i = 1;
	int j;
	int sum = 0;

	p[0] = 1;
#pragma omp parallel reduction (+: sum)
	{
		f1(&j);
		/* i and n are not flushed by f1: it cannot access them. */
		sum = sum + j;
		f2(&j);
		/*
		 * i and n are not flushed by f2 either, for the same reason;
		 * j is flushed because f2 can reach it through its argument.
		 */
		sum = sum + (i + j + *p + n);
	}
	return sum;
}
A.15 実行スレッド数の使い方
/* Every thread in the team passes its own thread number to work(). */
#pragma omp parallel private(i)
{
	i = omp_get_thread_num();
	work(i);
}
A.16 lockの使い方
#include <stdio.h>
#include <omp.h>

/*
 * Simple-lock example.  Every thread first prints its id while holding
 * the lock, then polls omp_test_lock(), calling skip() after each failed
 * attempt, and finally calls work() once it holds the lock.
 */
int main()
{
	omp_lock_t lck;
	int id;

	omp_init_lock(&lck);
#pragma omp parallel shared(lck) private(id)
	{
		id = omp_get_thread_num();

		/* Blocking acquire: the message is printed under the lock. */
		omp_set_lock(&lck);
		printf("My thread id is %d.\n", id);
		omp_unset_lock(&lck);

		while( !omp_test_lock(&lck) ){
			skip(id);	/* we do not yet have the lock,
					   so we must do something else */
		}
		work(id);	/* we now have the lock
				   and can do the work */

		omp_unset_lock(&lck);
	}

	omp_destroy_lock(&lck);
	return 0;
}
A.17 ネストしたlock
#include <omp.h>

/* Two integer members together with the nestable lock used when they
 * are updated. */
typedef struct {
	int a, b;
	omp_nest_lock_t lck;
} pair;

/*
 * Add a to p->a.  No locking is done here because the only caller is
 * incr_pair().
 */
void incr_a(pair *p, int a)
{
	p->a = p->a + a;
}

/*
 * Add b to p->b while holding p->lck.  The lock must be nestable because
 * this function is called from incr_pair() as well as from elsewhere.
 */
void incr_b(pair *p, int b)
{
	omp_nest_lock_t *lock = &p->lck;

	omp_set_nest_lock(lock);
	p->b = p->b + b;
	omp_unset_nest_lock(lock);
}

/*
 * Adds a to p->a and b to p->b while holding p->lck.  incr_b() sets the
 * same lock again while it is already held here.
 */
void incr_pair(pair *p, int a, int b)
{
	omp_set_nest_lock(&p->lck);
	incr_a(p, a);
	incr_b(p, b);
	omp_unset_nest_lock(&p->lck);
}

/*
 * Updates *p from two sections: the first increments both members
 * through incr_pair(), the second increments only b through incr_b().
 */
void f(pair *p)
{
	extern int work1(), work2(), work3();
#pragma omp parallel sections
	{
	#pragma omp section
		incr_pair(p, work1(), work2());
	#pragma omp section
		incr_b(p, work3());	/* one value only, so only b is updated */
	}
}
A.18 多重ループの並列化
/*
 * Nested loops parallelized inline: the outer loop is work-shared in the
 * outer parallel region, and every outer iteration opens a new parallel
 * region whose own work-sharing loop runs j over the same bound n.
 */
#pragma omp parallel default(shared)
{
#pragma omp for
	for( i = 0 ; i < n ; i++ ){
#pragma omp parallel shared(i, n)
	{
	#pragma omp for
		for( j = 0 ; j < n ; j++ )
			work(i, j);
	}
	}
}

/* Outer level of the nested example: the i loop is work-shared and each
 * iteration hands i and n to work1(). */
#pragma omp parallel default(shared)
{
#pragma omp for
	for (i = 0; i < n; ++i) {
		work1(i, n);
	}
}

/*
 * Inner level of the nested example: opens its own parallel region and
 * work-shares the j loop, calling work2(i, j) for every j in [0, n).
 * Returns nothing.
 */
void work1(int i, int n)
{
	int j;
#pragma omp parallel default(shared)
	{
	#pragma omp for
	for( j = 0 ; j < n ; j++ )
		work2(i, j);
	}
	return;
}
A.19 誤った使い方
/*
 * Listed as incorrect usage: the inner `for` directive is nested directly
 * inside the outer work-sharing loop of the same parallel region.
 */
void wrong1(int n)
{
#pragma omp parallel default(shared)
{
	int i, j;
#pragma omp for
	for( i = 0 ; i < n ; i++ ){
#pragma omp for
		for( j = 0 ; j < n ; j++ )
			work(i, j);
	}
}
}

/*
 * Listed as incorrect usage: the work-shared loop calls work1(), which
 * itself contains a `for` directive and opens no new parallel region, so
 * the nesting is the same as in wrong1() but spread over two functions.
 */
void wrong2(int n)
{
#pragma omp parallel default(shared)
{
	int i;
#pragma omp for
	for( i = 0 ; i < n ; i++ )
		work1(i, n);
}
}

/*
 * Helper for wrong2(): a `for` directive with no parallel directive in
 * this function, calling work2(i, j) for every j in [0, n).
 */
void work1(int i, int n)
{
	int j;
#pragma omp for
	for( j = 0 ; j < n ; j++ )
		work2(i, j);
}

/*
 * Listed as incorrect usage: a `single` directive is placed inside the
 * body of a work-sharing loop of the same parallel region.
 */
void wrong3(int n)
{
#pragma omp parallel default(shared)
{
	int i;
#pragma omp for
	for( i = 0 ; i < n ; i++ ){
	#pragma omp single
		work(i);
	}
}
}

/*
 * Listed as incorrect usage: a `barrier` directive is placed inside the
 * body of a work-sharing loop.
 */
void wrong4(int n)
{
#pragma omp parallel default(shared)
{
	int i;
#pragma omp for
	for( i = 0 ; i < n ; i++ ){
		work1(i);
	#pragma omp barrier
		work2(i);
	}
}
}

/*
 * Listed as incorrect usage: a `barrier` directive is placed inside a
 * critical section.
 */
void wrong5()
{
#pragma omp parallel default(shared)
{
#pragma omp critical
	{
		work1();
	#pragma omp barrier
		work2();
	}
}
}

/*
 * Listed as incorrect usage: a `barrier` directive is placed inside a
 * `single` block.
 */
void wrong6()
{
#pragma omp parallel
{
	setup();
#pragma omp single
	{
		work1();
	#pragma omp barrier
		work2();
	}
	finish();
}
}
A.20 barrier同期
/*
 * Driver for the barrier example: calls sub1(2) and then sub2(2) directly,
 * so sub2() is entered both from inside sub1()'s parallel loop and from
 * serial code.
 */
int main()
{
	sub1(2);
	sub2(2);
}

/* Work-share the calls sub2(0) .. sub2(n-1) inside a parallel region. */
void sub1(int n)
{
	int i;

#pragma omp parallel private(i) shared(n)
	{
#pragma omp for
		for (i = 0; i < n; ++i) {
			sub2(i);
		}
	}
}

/* Open a parallel region in which every thread calls sub3(k). */
void sub2(int k)
{
#pragma omp parallel shared(k)
	{
		sub3(k);
	}
}

/*
 * Calls work1(n), executes a barrier, then calls work2(n).  The barrier
 * directive has no enclosing parallel directive in this function; in this
 * file sub3() is reached through the parallel region opened in sub2().
 */
void sub3(int n)
{
	work1(n);
#pragma omp barrier
	work2(n);
}
A.21 private clauseのスコープ
	/*
	 * i is listed as private and j as firstprivate on the parallel
	 * directive, so the assignments inside the region go to the threads'
	 * own copies rather than to the i and j declared here.
	 */
	int  i, j;
	i = 1;
	j = 2;
#pragma omp parallel private(i) firstprivate(j)
	{
		i = 3;
		j = j + 2;
	}
	printf("i=%d, j=%d\n", i, j);	/* these values are undefined! */
A.22 default(none) clause
int x, y, z[1000];
#pragma omp threadprivate(x)

/*
 * Illustrates default(none): inside the region every referenced variable
 * needs an explicit data-sharing attribute unless it is declared in the
 * region, threadprivate, const-qualified, or a loop control variable.
 * The two statements marked "Error" are intentionally non-conforming and
 * are kept as part of the illustration.
 */
void fun(int a)
{
	const int c = 1;
	int i = 0;
#pragma omp parallel default(none) private(a) shared(z)
{
	int j = omp_get_num_threads();
		// O.K. - j is declared within parallel region
	a = z[j];	// O.K. - a is listed in private clause
			//      - z is listed in shared clause
	x = c;		// O.K. - x is threadprivate
			//      - c has const-qualified type
	z[i] = y;	// Error - cannot reference i or y here

	#pragma omp for firstprivate(y)
	for( i = 0 ; i < 10 ; i++ ){
		z[i] = y;	// O.K. - i is the loop control variable
				//      - y is listed in firstprivate clause
	}
	z[i] = y;	// Error - cannot reference i or y here
}
}