/*
 * Copyright 2013-2015 Formal Methods and Tools, University of Twente
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <errno.h>    // for errno
#include <sched.h>    // for sched_getaffinity
#include <stdio.h>    // for fprintf
#include <stdlib.h>   // for posix_memalign, calloc, exit
#include <string.h>   // for memset, memcpy, strerror
#include <sys/mman.h> // for mmap, mprotect
#include <sys/time.h> // for gettimeofday
#include <pthread.h>
#include <unistd.h>
#include <assert.h>

#include <lace.h>

#ifndef USE_HWLOC
#define USE_HWLOC 0
#endif

#if USE_HWLOC
#include <hwloc.h>
#endif
// public Worker data
static Worker **workers;
static size_t default_stacksize = 0; // set by lace_init
static size_t default_dqsize = 100000;

#if USE_HWLOC
static hwloc_topology_t topo;
static unsigned int n_nodes, n_cores, n_pus;
#endif

static int verbosity = 0;

static int n_workers = 0;
static int enabled_workers = 0;

// private Worker data (just for stats at end)
static WorkerP **workers_p;

// set to 1 when quitting
static int lace_quits = 0;

// for storing private Worker data
#ifdef __linux__ // use gcc thread-local storage (i.e. __thread variables)
static __thread WorkerP *current_worker;
#else
static pthread_key_t worker_key;
#endif
static pthread_attr_t worker_attr;

static pthread_cond_t wait_until_done = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t wait_until_done_mutex = PTHREAD_MUTEX_INITIALIZER;

struct lace_worker_init
{
    void* stack;
    size_t stacksize;
};

static struct lace_worker_init *workers_init;

lace_newframe_t lace_newframe;

WorkerP*
lace_get_worker()
{
#ifdef __linux__
    return current_worker;
#else
    return (WorkerP*)pthread_getspecific(worker_key);
#endif
}
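
/*
 * Find the head of the deque: the first slot whose thief field is still 0.
 * The first three slots are probed directly; after that, an exponential
 * (doubling) probe brackets the first empty slot, and a binary search zooms
 * in. This relies on the invariant that slots holding tasks form a prefix
 * of the deque.
 */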
Task*
lace_get_head(WorkerP *self)
{
    Task *dq = self->dq;

    if (dq[0].thief == 0) return dq;
    if (dq[1].thief == 0) return dq+1;
    if (dq[2].thief == 0) return dq+2;

    size_t low = 2;
    size_t high = self->end - self->dq;

    for (;;) {
        if (low*2 >= high) {
            break;
        } else if (dq[low*2].thief == 0) {
            high = low*2;
            break;
        } else {
            low *= 2;
        }
    }

    while (low < high) {
        size_t mid = low + (high-low)/2;
        if (dq[mid].thief == 0) high = mid;
        else low = mid + 1;
    }

    return dq+low;
}
size_t
lace_workers()
{
    return n_workers;
}

size_t
lace_default_stacksize()
{
    return default_stacksize;
}
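
// cas wraps gcc's __sync_bool_compare_and_swap: it atomically replaces *ptr
// by new if *ptr currently equals old, returning nonzero iff the swap happened.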
#ifndef cas
#define cas(ptr, old, new) (__sync_bool_compare_and_swap((ptr),(old),(new)))
#endif

#if LACE_PIE_TIMES
static uint64_t count_at_start, count_at_end;
static long long unsigned us_elapsed_timer;

static void
us_elapsed_start(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    us_elapsed_timer = now.tv_sec * 1000000LL + now.tv_usec;
}

static long long unsigned
us_elapsed(void)
{
    struct timeval now;
    long long unsigned t;

    gettimeofday(&now, NULL);
    t = now.tv_sec * 1000000LL + now.tv_usec;

    return t - us_elapsed_timer;
}
#endif
#if USE_HWLOC
// Lock used only during parallel lace_init_worker...
static volatile int __attribute__((aligned(64))) lock = 0;

static inline void
lock_acquire()
{
    while (1) {
        while (lock) {}
        if (cas(&lock, 0, 1)) return;
    }
}

static inline void
lock_release()
{
    lock = 0;
}
#endif
/* Barrier */
#define BARRIER_MAX_THREADS 128

typedef union __attribute__((__packed__))
{
    volatile size_t val;
    char pad[LINE_SIZE];
} asize_t;

typedef struct {
    volatile int __attribute__((aligned(LINE_SIZE))) count;
    volatile int __attribute__((aligned(LINE_SIZE))) wait;
    /* the following is needed only for destroy: */
    asize_t entered[BARRIER_MAX_THREADS];
} barrier_t;

barrier_t lace_bar;
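
/*
 * lace_barrier is a sense-reversing barrier: each arriving worker reads the
 * current wait flag; the last one to arrive resets the counter and flips the
 * flag, which releases the spinning workers. The entered[] flags exist only
 * so that lace_barrier_destroy can wait until every worker has left.
 */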
void
lace_barrier()
{
    int id = lace_get_worker()->worker;
    lace_bar.entered[id].val = 1; // signal entry

    int wait = lace_bar.wait;
    if (enabled_workers == __sync_add_and_fetch(&lace_bar.count, 1)) {
        lace_bar.count = 0;              // reset counter
        lace_bar.wait = 1 - wait;        // flip wait
        lace_bar.entered[id].val = 0;    // signal exit
    } else {
        while (wait == lace_bar.wait) {} // wait
        lace_bar.entered[id].val = 0;    // signal exit
    }
}

static void
lace_barrier_init()
{
    assert(n_workers <= BARRIER_MAX_THREADS);
    memset(&lace_bar, 0, sizeof(barrier_t));
}

static void
lace_barrier_destroy()
{
    // wait for all to exit
    for (int i=0; i<n_workers; i++) {
        while (1 == lace_bar.entered[i].val) {}
    }
}
void
lace_init_worker(int worker, size_t dq_size)
{
    Worker *wt = NULL;
    WorkerP *w = NULL;

    if (dq_size == 0) dq_size = default_dqsize;

#if USE_HWLOC
    // Get our logical processor
    hwloc_obj_t pu = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, worker % n_pus);

    // Pin our thread...
    hwloc_set_cpubind(topo, pu->cpuset, HWLOC_CPUBIND_THREAD);

    // Allocate memory on our node...
    lock_acquire();
    wt = (Worker *)hwloc_alloc_membind(topo, sizeof(Worker), pu->cpuset, HWLOC_MEMBIND_BIND, 0);
    w = (WorkerP *)hwloc_alloc_membind(topo, sizeof(WorkerP), pu->cpuset, HWLOC_MEMBIND_BIND, 0);
    if (wt == NULL || w == NULL || (w->dq = (Task*)hwloc_alloc_membind(topo, dq_size * sizeof(Task), pu->cpuset, HWLOC_MEMBIND_BIND, 0)) == NULL) {
        fprintf(stderr, "Lace error: Unable to allocate memory for the Lace worker!\n");
        exit(1);
    }
    lock_release();
#else
    // Allocate memory...
    if (posix_memalign((void**)&wt, LINE_SIZE, sizeof(Worker)) ||
        posix_memalign((void**)&w, LINE_SIZE, sizeof(WorkerP)) ||
        posix_memalign((void**)&w->dq, LINE_SIZE, dq_size * sizeof(Task))) {
        fprintf(stderr, "Lace error: Unable to allocate memory for the Lace worker!\n");
        exit(1);
    }
#endif

    // Initialize public worker data
    wt->dq = w->dq;
    wt->ts.v = 0;
    wt->allstolen = 0;
    wt->movesplit = 0;

    // Initialize private worker data
    w->_public = wt;
    w->end = w->dq + dq_size;
    w->split = w->dq;
    w->allstolen = 0;
    w->worker = worker;
#if USE_HWLOC
    w->pu = worker % n_pus;
#else
    w->pu = -1;
#endif
    w->enabled = 1;
    if (workers_init[worker].stack != 0) {
        w->stack_trigger = ((size_t)workers_init[worker].stack) + workers_init[worker].stacksize/20;
    } else {
        w->stack_trigger = 0;
    }

#if LACE_COUNT_EVENTS
    // Reset counters
    { int k; for (k=0; k<CTR_MAX; k++) w->ctr[k] = 0; }
#endif

    // Set pointers
#ifdef __linux__
    current_worker = w;
#else
    pthread_setspecific(worker_key, w);
#endif
    workers[worker] = wt;
    workers_p[worker] = w;

    // Synchronize with others
    lace_barrier();

#if LACE_PIE_TIMES
    w->time = gethrtime();
    w->level = 0;
#endif
}
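
/*
 * macOS does not provide pthread_barrier_t, so a minimal substitute built on
 * a mutex and a condition variable is supplied below for that platform.
 */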
#if defined(__APPLE__) && !defined(pthread_barrier_t)

typedef int pthread_barrierattr_t;
typedef struct
{
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    int count;
    int tripCount;
} pthread_barrier_t;

static int
pthread_barrier_init(pthread_barrier_t *barrier, const pthread_barrierattr_t *attr, unsigned int count)
{
    if(count == 0)
    {
        errno = EINVAL;
        return -1;
    }
    if(pthread_mutex_init(&barrier->mutex, 0) < 0)
    {
        return -1;
    }
    if(pthread_cond_init(&barrier->cond, 0) < 0)
    {
        pthread_mutex_destroy(&barrier->mutex);
        return -1;
    }
    barrier->tripCount = count;
    barrier->count = 0;

    return 0;
    (void)attr;
}

static int
pthread_barrier_destroy(pthread_barrier_t *barrier)
{
    pthread_cond_destroy(&barrier->cond);
    pthread_mutex_destroy(&barrier->mutex);
    return 0;
}

static int
pthread_barrier_wait(pthread_barrier_t *barrier)
{
    pthread_mutex_lock(&barrier->mutex);
    ++(barrier->count);
    if(barrier->count >= barrier->tripCount)
    {
        barrier->count = 0;
        pthread_cond_broadcast(&barrier->cond);
        pthread_mutex_unlock(&barrier->mutex);
        return 1;
    }
    else
    {
        pthread_cond_wait(&barrier->cond, &(barrier->mutex));
        pthread_mutex_unlock(&barrier->mutex);
        return 0;
    }
}

#endif // defined(__APPLE__) && !defined(pthread_barrier_t)
static pthread_barrier_t suspend_barrier;
static volatile int must_suspend = 0, suspended = 0;
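
/*
 * Suspend/resume protocol: lace_suspend raises must_suspend so that workers
 * in their steal loops rendezvous in lace_barrier and then park on
 * suspend_barrier. lace_resume joins that barrier once, releasing them;
 * workers that are still disabled immediately park on it again and stay
 * there until a later resume.
 */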
void
lace_suspend()
{
    if (suspended == 0) {
        suspended = 1;
        must_suspend = 1;
        lace_barrier();
        must_suspend = 0;
    }
}

void
lace_resume()
{
    if (suspended == 1) {
        suspended = 0;
        pthread_barrier_wait(&suspend_barrier);
    }
}

/**
 * With set_workers, all workers 0..(N-1) are enabled and N..max are disabled.
 * You can never disable the current worker or reduce the number of workers below 1.
 */
void
lace_disable_worker(int worker)
{
    int self = lace_get_worker()->worker;
    if (worker == self) return;
    if (workers_p[worker]->enabled == 1) {
        workers_p[worker]->enabled = 0;
        enabled_workers--;
    }
}

void
lace_enable_worker(int worker)
{
    int self = lace_get_worker()->worker;
    if (worker == self) return;
    if (workers_p[worker]->enabled == 0) {
        workers_p[worker]->enabled = 1;
        enabled_workers++;
    }
}

void
lace_set_workers(int workercount)
{
    if (workercount < 1) workercount = 1;
    if (workercount > n_workers) workercount = n_workers;
    enabled_workers = workercount;
    int self = lace_get_worker()->worker;
    if (self >= workercount) workercount--;
    for (int i=0; i<n_workers; i++) {
        workers_p[i]->enabled = (i < workercount || i == self) ? 1 : 0;
    }
}

int
lace_enabled_workers()
{
    return enabled_workers;
}
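
/*
 * rng is a small linear congruential generator (the classic constants
 * 1103515245 and 12345). It only has to be fast and reasonably spread,
 * since it is used solely to pick steal victims and steal counts.
 */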
static inline uint32_t
rng(uint32_t *seed, int max)
{
    uint32_t next = *seed;

    next *= 1103515245;
    next += 12345;

    *seed = next;

    return next % max;
}
VOID_TASK_IMPL_0(lace_steal_random)
{
    Worker *victim = workers[(__lace_worker->worker + 1 + rng(&__lace_worker->seed, n_workers-1)) % n_workers];

    YIELD_NEWFRAME();

    PR_COUNTSTEALS(__lace_worker, CTR_steal_tries);
    Worker *res = lace_steal(__lace_worker, __lace_dq_head, victim);
    if (res == LACE_STOLEN) {
        PR_COUNTSTEALS(__lace_worker, CTR_steals);
    } else if (res == LACE_BUSY) {
        PR_COUNTSTEALS(__lace_worker, CTR_steal_busy);
    }
}

VOID_TASK_IMPL_1(lace_steal_random_loop, int*, quit)
{
    while(!(*(volatile int*)quit)) {
        lace_steal_random();
        if (must_suspend) {
            lace_barrier();
            do {
                pthread_barrier_wait(&suspend_barrier);
            } while (__lace_worker->enabled == 0);
        }
    }
}
static lace_startup_cb main_cb;

static void*
lace_main_wrapper(void *arg)
{
    lace_init_worker(0, 0);
    WorkerP *self = lace_get_worker();

#if LACE_PIE_TIMES
    self->time = gethrtime();
#endif

    lace_time_event(self, 1);
    main_cb(self, self->dq, arg);
    lace_exit();
    pthread_cond_broadcast(&wait_until_done);

    return NULL;
}
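
/*
 * The main steal loop: pick a random victim, then walk the worker array
 * sequentially from it for a random number of attempts before picking a new
 * random victim. This keeps victim selection cheap while avoiding systematic
 * collisions between thieves.
 */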
VOID_TASK_IMPL_1(lace_steal_loop, int*, quit)
{
    // Determine who I am
    const int worker_id = __lace_worker->worker;

    // Prepare self, victim
    Worker ** const self = &workers[worker_id];
    Worker **victim = self;

#if LACE_PIE_TIMES
    __lace_worker->time = gethrtime();
#endif

    uint32_t seed = worker_id;
    unsigned int n = n_workers;
    int i=0;

    while(*(volatile int*)quit == 0) {
        // Select victim
        if( i>0 ) {
            i--;
            victim++;
            if (victim == self) victim++;
            if (victim >= workers + n) victim = workers;
            if (victim == self) victim++;
        } else {
            i = rng(&seed, 40); // compute random i in 0..39
            victim = workers + (rng(&seed, n-1) + worker_id + 1) % n;
        }

        PR_COUNTSTEALS(__lace_worker, CTR_steal_tries);
        Worker *res = lace_steal(__lace_worker, __lace_dq_head, *victim);
        if (res == LACE_STOLEN) {
            PR_COUNTSTEALS(__lace_worker, CTR_steals);
        } else if (res == LACE_BUSY) {
            PR_COUNTSTEALS(__lace_worker, CTR_steal_busy);
        }

        YIELD_NEWFRAME();

        if (must_suspend) {
            lace_barrier();
            do {
                pthread_barrier_wait(&suspend_barrier);
            } while (__lace_worker->enabled == 0);
        }
    }
}
static void*
lace_default_worker(void* arg)
{
    lace_init_worker((size_t)arg, 0);
    WorkerP *__lace_worker = lace_get_worker();
    Task *__lace_dq_head = __lace_worker->dq;
    lace_steal_loop(&lace_quits);
    lace_time_event(__lace_worker, 9);
    lace_barrier();
    return NULL;
}
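
/*
 * lace_spawn_worker rounds the stack size up to whole pages, allocates one
 * extra page, and makes that first page inaccessible (PROT_NONE) so a stack
 * overflow faults on the guard page instead of silently corrupting memory.
 * lace_init_worker later sets stack_trigger slightly above the stack base
 * (stacksize/20) so running tasks can cheaply detect a nearly full stack.
 */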
pthread_t
lace_spawn_worker(int worker, size_t stacksize, void* (*fun)(void*), void* arg)
{
    // Determine stack size
    if (stacksize == 0) stacksize = default_stacksize;

    size_t pagesize = sysconf(_SC_PAGESIZE);
    stacksize = (stacksize + pagesize - 1) & ~(pagesize - 1); // ceil(stacksize, pagesize)

#if USE_HWLOC
    // Get our logical processor
    hwloc_obj_t pu = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, worker % n_pus);

    // Allocate memory for the program stack
    lock_acquire();
    void *stack_location = hwloc_alloc_membind(topo, stacksize + pagesize, pu->cpuset, HWLOC_MEMBIND_BIND, 0);
    lock_release();
    if (stack_location == 0) {
        fprintf(stderr, "Lace error: Unable to allocate memory for the pthread stack!\n");
        exit(1);
    }
#else
    void *stack_location = mmap(NULL, stacksize + pagesize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
    if (stack_location == MAP_FAILED) {
        fprintf(stderr, "Lace error: Cannot allocate program stack: %s!\n", strerror(errno));
        exit(1);
    }
#endif

    if (0 != mprotect(stack_location, pagesize, PROT_NONE)) {
        fprintf(stderr, "Lace error: Unable to protect the allocated program stack with a guard page!\n");
        exit(1);
    }
    stack_location = (uint8_t *)stack_location + pagesize; // skip protected page.
    if (0 != pthread_attr_setstack(&worker_attr, stack_location, stacksize)) {
        fprintf(stderr, "Lace error: Unable to set the pthread stack in Lace!\n");
        exit(1);
    }

    workers_init[worker].stack = stack_location;
    workers_init[worker].stacksize = stacksize;

    if (fun == 0) {
        fun = lace_default_worker;
        arg = (void*)(size_t)worker;
    }

    pthread_t res;
    pthread_create(&res, &worker_attr, fun, arg);
    return res;
}
static int
get_cpu_count()
{
#if USE_HWLOC
    int count = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
#elif defined(sched_getaffinity)
    /* Best solution: find actual available cpus */
    cpu_set_t cs;
    CPU_ZERO(&cs);
    sched_getaffinity(0, sizeof(cs), &cs);
    int count = CPU_COUNT(&cs);
#elif defined(_SC_NPROCESSORS_ONLN)
    /* Fallback */
    int count = sysconf(_SC_NPROCESSORS_ONLN);
#else
    /* Okay... */
    int count = 1;
#endif
    return count < 1 ? 1 : count;
}
void
lace_set_verbosity(int level)
{
    verbosity = level;
}

void
lace_init(int n, size_t dqsize)
{
#if USE_HWLOC
    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);

    n_nodes = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NODE);
    n_cores = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
    n_pus = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
#endif

    // Initialize globals
    n_workers = n;
    if (n_workers == 0) n_workers = get_cpu_count();
    enabled_workers = n_workers;
    if (dqsize != 0) default_dqsize = dqsize;
    lace_quits = 0;

    // Create barrier for all workers
    lace_barrier_init();

    // Create suspend barrier
    pthread_barrier_init(&suspend_barrier, NULL, n_workers);

    // Allocate array with all workers
    if (posix_memalign((void**)&workers, LINE_SIZE, n_workers*sizeof(Worker*)) != 0 ||
        posix_memalign((void**)&workers_p, LINE_SIZE, n_workers*sizeof(WorkerP*)) != 0) {
        fprintf(stderr, "Lace error: unable to allocate memory!\n");
        exit(1);
    }

    // Create pthread key
#ifndef __linux__
    pthread_key_create(&worker_key, NULL);
#endif

    // Prepare structures for thread creation
    pthread_attr_init(&worker_attr);

    // Set contention scope to system (instead of process)
    pthread_attr_setscope(&worker_attr, PTHREAD_SCOPE_SYSTEM);

    // Get default stack size
    if (pthread_attr_getstacksize(&worker_attr, &default_stacksize) != 0) {
        fprintf(stderr, "Lace warning: pthread_attr_getstacksize returned error!\n");
        default_stacksize = 1048576; // 1 megabyte default
    }

    if (verbosity) {
#if USE_HWLOC
        fprintf(stderr, "Initializing Lace, %u nodes, %u cores, %u logical processors, %d workers.\n", n_nodes, n_cores, n_pus, n_workers);
#else
        fprintf(stderr, "Initializing Lace, %d workers.\n", n_workers);
#endif
    }

    // Prepare lace_init structure
    workers_init = (struct lace_worker_init*)calloc(1, sizeof(struct lace_worker_init) * n_workers);

    lace_newframe.t = NULL;

#if LACE_PIE_TIMES
    // Initialize counters for pie times
    us_elapsed_start();
    count_at_start = gethrtime();
#endif
}
void
lace_startup(size_t stacksize, lace_startup_cb cb, void *arg)
{
    if (stacksize == 0) stacksize = default_stacksize;

    if (verbosity) {
        if (cb != 0) {
            fprintf(stderr, "Lace startup, creating %d worker threads with program stack %zu bytes.\n", n_workers, stacksize);
        } else if (n_workers == 1) {
            fprintf(stderr, "Lace startup, creating 0 worker threads.\n");
        } else {
            fprintf(stderr, "Lace startup, creating %d worker threads with program stack %zu bytes.\n", n_workers-1, stacksize);
        }
    }

    /* Spawn workers */
    int i;
    for (i=1; i<n_workers; i++) lace_spawn_worker(i, stacksize, 0, 0);

    if (cb != 0) {
        main_cb = cb;
        lace_spawn_worker(0, stacksize, lace_main_wrapper, arg);

        // Suspend this thread until cb returns
        pthread_mutex_lock(&wait_until_done_mutex);
        pthread_cond_wait(&wait_until_done, &wait_until_done_mutex);
        pthread_mutex_unlock(&wait_until_done_mutex);
    } else {
        // use this thread as worker and return control
        lace_init_worker(0, 0);
        lace_time_event(lace_get_worker(), 1);
    }
}
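
/*
 * Usage sketch (illustrative only, not part of this file; the task macros
 * below are declared in lace.h, and the fib task is a made-up example):
 *
 *   TASK_1(int, fib, int, n)
 *   {
 *       if (n < 2) return n;
 *       SPAWN(fib, n-1);          // push child task, available for stealing
 *       int b = CALL(fib, n-2);   // execute the other child directly
 *       return SYNC(fib) + b;     // join the spawned child
 *   }
 *
 *   int main(void)
 *   {
 *       lace_init(0, 0);          // 0 = autodetect workers, default deque size
 *       lace_startup(0, 0, 0);    // no callback: this thread becomes worker 0
 *       // ... run tasks from the worker context ...
 *       lace_exit();
 *       return 0;
 *   }
 */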
#if LACE_COUNT_EVENTS
static uint64_t ctr_all[CTR_MAX];
#endif

void
lace_count_reset()
{
#if LACE_COUNT_EVENTS
    int i;
    size_t j;

    for (i=0;i<n_workers;i++) {
        for (j=0;j<CTR_MAX;j++) {
            workers_p[i]->ctr[j] = 0;
        }
    }

#if LACE_PIE_TIMES
    for (i=0;i<n_workers;i++) {
        workers_p[i]->time = gethrtime();
        if (i != 0) workers_p[i]->level = 0;
    }

    us_elapsed_start();
    count_at_start = gethrtime();
#endif
#endif
}
void
lace_count_report_file(FILE *file)
{
#if LACE_COUNT_EVENTS
    int i;
    size_t j;

    for (j=0;j<CTR_MAX;j++) ctr_all[j] = 0;
    for (i=0;i<n_workers;i++) {
        uint64_t *wctr = workers_p[i]->ctr;
        for (j=0;j<CTR_MAX;j++) {
            ctr_all[j] += wctr[j];
        }
    }

#if LACE_COUNT_TASKS
    for (i=0;i<n_workers;i++) {
        fprintf(file, "Tasks (%d): %zu\n", i, workers_p[i]->ctr[CTR_tasks]);
    }
    fprintf(file, "Tasks (sum): %zu\n", ctr_all[CTR_tasks]);
    fprintf(file, "\n");
#endif

#if LACE_COUNT_STEALS
    for (i=0;i<n_workers;i++) {
        fprintf(file, "Steals (%d): %zu good/%zu busy of %zu tries; leaps: %zu good/%zu busy of %zu tries\n", i,
            workers_p[i]->ctr[CTR_steals], workers_p[i]->ctr[CTR_steal_busy],
            workers_p[i]->ctr[CTR_steal_tries], workers_p[i]->ctr[CTR_leaps],
            workers_p[i]->ctr[CTR_leap_busy], workers_p[i]->ctr[CTR_leap_tries]);
    }
    fprintf(file, "Steals (sum): %zu good/%zu busy of %zu tries; leaps: %zu good/%zu busy of %zu tries\n",
        ctr_all[CTR_steals], ctr_all[CTR_steal_busy],
        ctr_all[CTR_steal_tries], ctr_all[CTR_leaps],
        ctr_all[CTR_leap_busy], ctr_all[CTR_leap_tries]);
    fprintf(file, "\n");
#endif

#if LACE_COUNT_STEALS && LACE_COUNT_TASKS
    for (i=0;i<n_workers;i++) {
        fprintf(file, "Tasks per steal (%d): %zu\n", i,
            workers_p[i]->ctr[CTR_tasks]/(workers_p[i]->ctr[CTR_steals]+workers_p[i]->ctr[CTR_leaps]));
    }
    fprintf(file, "Tasks per steal (sum): %zu\n", ctr_all[CTR_tasks]/(ctr_all[CTR_steals]+ctr_all[CTR_leaps]));
    fprintf(file, "\n");
#endif

#if LACE_COUNT_SPLITS
    for (i=0;i<n_workers;i++) {
        fprintf(file, "Splits (%d): %zu shrinks, %zu grows, %zu outgoing requests\n", i,
            workers_p[i]->ctr[CTR_split_shrink], workers_p[i]->ctr[CTR_split_grow], workers_p[i]->ctr[CTR_split_req]);
    }
    fprintf(file, "Splits (sum): %zu shrinks, %zu grows, %zu outgoing requests\n",
        ctr_all[CTR_split_shrink], ctr_all[CTR_split_grow], ctr_all[CTR_split_req]);
    fprintf(file, "\n");
#endif

#if LACE_PIE_TIMES
    count_at_end = gethrtime();

    uint64_t count_per_ms = (count_at_end - count_at_start) / (us_elapsed() / 1000);
    double dcpm = (double)count_per_ms;

    uint64_t sum_count;
    sum_count = ctr_all[CTR_init] + ctr_all[CTR_wapp] + ctr_all[CTR_lapp] + ctr_all[CTR_wsteal] + ctr_all[CTR_lsteal]
              + ctr_all[CTR_close] + ctr_all[CTR_wstealsucc] + ctr_all[CTR_lstealsucc] + ctr_all[CTR_wsignal]
              + ctr_all[CTR_lsignal];

    fprintf(file, "Measured clock (tick) frequency: %.2f GHz\n", count_per_ms / 1000000.0);
    fprintf(file, "Aggregated time per pie slice, total time: %.2f CPU seconds\n\n", sum_count / (1000*dcpm));

    for (i=0;i<n_workers;i++) {
        fprintf(file, "Startup time (%d): %10.2f ms\n", i, workers_p[i]->ctr[CTR_init] / dcpm);
        fprintf(file, "Steal work (%d): %10.2f ms\n", i, workers_p[i]->ctr[CTR_wapp] / dcpm);
        fprintf(file, "Leap work (%d): %10.2f ms\n", i, workers_p[i]->ctr[CTR_lapp] / dcpm);
        fprintf(file, "Steal overhead (%d): %10.2f ms\n", i, (workers_p[i]->ctr[CTR_wstealsucc]+workers_p[i]->ctr[CTR_wsignal]) / dcpm);
        fprintf(file, "Leap overhead (%d): %10.2f ms\n", i, (workers_p[i]->ctr[CTR_lstealsucc]+workers_p[i]->ctr[CTR_lsignal]) / dcpm);
        fprintf(file, "Steal search (%d): %10.2f ms\n", i, (workers_p[i]->ctr[CTR_wsteal]-workers_p[i]->ctr[CTR_wstealsucc]-workers_p[i]->ctr[CTR_wsignal]) / dcpm);
        fprintf(file, "Leap search (%d): %10.2f ms\n", i, (workers_p[i]->ctr[CTR_lsteal]-workers_p[i]->ctr[CTR_lstealsucc]-workers_p[i]->ctr[CTR_lsignal]) / dcpm);
        fprintf(file, "Exit time (%d): %10.2f ms\n", i, workers_p[i]->ctr[CTR_close] / dcpm);
        fprintf(file, "\n");
    }

    fprintf(file, "Startup time (sum): %10.2f ms\n", ctr_all[CTR_init] / dcpm);
    fprintf(file, "Steal work (sum): %10.2f ms\n", ctr_all[CTR_wapp] / dcpm);
    fprintf(file, "Leap work (sum): %10.2f ms\n", ctr_all[CTR_lapp] / dcpm);
    fprintf(file, "Steal overhead (sum): %10.2f ms\n", (ctr_all[CTR_wstealsucc]+ctr_all[CTR_wsignal]) / dcpm);
    fprintf(file, "Leap overhead (sum): %10.2f ms\n", (ctr_all[CTR_lstealsucc]+ctr_all[CTR_lsignal]) / dcpm);
    fprintf(file, "Steal search (sum): %10.2f ms\n", (ctr_all[CTR_wsteal]-ctr_all[CTR_wstealsucc]-ctr_all[CTR_wsignal]) / dcpm);
    fprintf(file, "Leap search (sum): %10.2f ms\n", (ctr_all[CTR_lsteal]-ctr_all[CTR_lstealsucc]-ctr_all[CTR_lsignal]) / dcpm);
    fprintf(file, "Exit time (sum): %10.2f ms\n", ctr_all[CTR_close] / dcpm);
    fprintf(file, "\n");
#endif
#endif

    return;
    (void)file;
}
void lace_exit()
{
    lace_time_event(lace_get_worker(), 2);

    // first suspend all other threads
    lace_suspend();

    // now enable all threads and tell them to quit
    lace_set_workers(n_workers);
    lace_quits = 1;

    // now resume all threads and wait until they all pass the barrier
    lace_resume();
    lace_barrier();

    // finally, destroy the barriers
    lace_barrier_destroy();
    pthread_barrier_destroy(&suspend_barrier);

#if LACE_COUNT_EVENTS
    lace_count_report_file(stderr);
#endif
}
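
/*
 * lace_exec_in_new_frame temporarily presents the whole deque as stolen
 * (allstolen=1, tail=split=current head) so that no thief can pick up tasks
 * from the old frame, runs the root task, and then restores the saved
 * tail/split/allstolen values. The surrounding barriers ensure that no
 * worker still holds references into the previous frame.
 */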
void
lace_exec_in_new_frame(WorkerP *__lace_worker, Task *__lace_dq_head, Task *root)
{
    TailSplit old;
    uint8_t old_as;

    // save old tail, split, allstolen and initiate new frame
    {
        Worker *wt = __lace_worker->_public;

        old_as = wt->allstolen;
        wt->allstolen = 1;
        old.ts.split = wt->ts.ts.split;
        wt->ts.ts.split = 0;
        mfence();
        old.ts.tail = wt->ts.ts.tail;

        TailSplit ts_new;
        ts_new.ts.tail = __lace_dq_head - __lace_worker->dq;
        ts_new.ts.split = __lace_dq_head - __lace_worker->dq;
        wt->ts.v = ts_new.v;

        __lace_worker->split = __lace_dq_head;
        __lace_worker->allstolen = 1;
    }

    // wait until all workers are ready
    lace_barrier();

    // execute task
    root->f(__lace_worker, __lace_dq_head, root);
    compiler_barrier();

    // wait until all workers are back (else they may steal from previous frame)
    lace_barrier();

    // restore tail, split, allstolen
    {
        Worker *wt = __lace_worker->_public;
        wt->allstolen = old_as;
        wt->ts.v = old.v;
        __lace_worker->split = __lace_worker->dq + old.ts.split;
        __lace_worker->allstolen = old_as;
    }
}
VOID_TASK_IMPL_2(lace_steal_loop_root, Task*, t, int*, done)
{
    t->f(__lace_worker, __lace_dq_head, t);
    *done = 1;
}

VOID_TASK_2(lace_together_helper, Task*, t, volatile int*, finished)
{
    t->f(__lace_worker, __lace_dq_head, t);

    for (;;) {
        int f = *finished;
        if (cas(finished, f, f-1)) break;
    }

    while (*finished != 0) STEAL_RANDOM();
}

static void
lace_sync_and_exec(WorkerP *__lace_worker, Task *__lace_dq_head, Task *root)
{
    // wait until other workers have made a local copy
    lace_barrier();

    // one worker sets t to 0 again
    if (LACE_WORKER_ID == 0) lace_newframe.t = 0;
    // else while (*(volatile Task**)&lace_newframe.t != 0) {}
    // the above line is commented out since lace_exec_in_new_frame includes
    // a lace_barrier before the task is executed

    lace_exec_in_new_frame(__lace_worker, __lace_dq_head, root);
}

void
lace_yield(WorkerP *__lace_worker, Task *__lace_dq_head)
{
    // make a local copy of the task
    Task _t;
    memcpy(&_t, lace_newframe.t, sizeof(Task));

    // wait until all workers have made a local copy
    lace_barrier();

    // one worker sets t to 0 again
    if (LACE_WORKER_ID == 0) lace_newframe.t = 0;
    // else while (*(volatile Task**)&lace_newframe.t != 0) {}
    // the above line is commented out since lace_exec_in_new_frame includes
    // a lace_barrier before the task is executed

    lace_exec_in_new_frame(__lace_worker, __lace_dq_head, &_t);
}
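
/*
 * lace_do_together runs one copy of t on every worker: `done` starts at
 * n_workers, each lace_together_helper executes the task, atomically
 * decrements the counter, and then steals until it reaches zero, so all
 * workers leave the frame together. lace_do_newframe instead runs t on one
 * worker while the others execute lace_steal_loop until `done` is set.
 */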
void
lace_do_together(WorkerP *__lace_worker, Task *__lace_dq_head, Task *t)
{
    /* synchronization integer */
    int done = n_workers;

    /* wrap task in lace_together_helper */
    Task _t2;
    TD_lace_together_helper *t2 = (TD_lace_together_helper *)&_t2;
    t2->f = lace_together_helper_WRAP;
    t2->thief = THIEF_TASK;
    t2->d.args.arg_1 = t;
    t2->d.args.arg_2 = &done;

    while (!cas(&lace_newframe.t, 0, &_t2)) lace_yield(__lace_worker, __lace_dq_head);
    lace_sync_and_exec(__lace_worker, __lace_dq_head, &_t2);
}

void
lace_do_newframe(WorkerP *__lace_worker, Task *__lace_dq_head, Task *t)
{
    /* synchronization integer */
    int done = 0;

    /* wrap task in lace_steal_loop_root */
    Task _t2;
    TD_lace_steal_loop_root *t2 = (TD_lace_steal_loop_root *)&_t2;
    t2->f = lace_steal_loop_root_WRAP;
    t2->thief = THIEF_TASK;
    t2->d.args.arg_1 = t;
    t2->d.args.arg_2 = &done;

    /* and create the lace_steal_loop task for other workers */
    Task _s;
    TD_lace_steal_loop *s = (TD_lace_steal_loop *)&_s;
    s->f = &lace_steal_loop_WRAP;
    s->thief = THIEF_TASK;
    s->d.args.arg_1 = &done;

    compiler_barrier();

    while (!cas(&lace_newframe.t, 0, &_s)) lace_yield(__lace_worker, __lace_dq_head);
    lace_sync_and_exec(__lace_worker, __lace_dq_head, &_t2);
}