mirror of https://github.com/KLayout/klayout.git
Parallelize hierarchical DRC processing with OpenMP
- Replace sequential task queues in dbHierProcessor and dbCompoundOperation with OpenMP tasking and parallel loops
- Add thread safety to CompoundRegionOperationCache using tl::Mutex
- Enable OpenMP compiler and linker flags in klayout.pri
- Update _drc_engine.rb documentation to explicitly state thread allocation for hierarchical mode
This commit is contained in:
parent
6270877110
commit
197aad9c89
7
build.sh
7
build.sh
|
|
@ -42,6 +42,7 @@ HAVE_EXPAT=0
|
|||
HAVE_GIT2=1
|
||||
HAVE_LSTREAM=1
|
||||
HAVE_CPP20=0
|
||||
HAVE_OPENMP=0
|
||||
|
||||
RUBYINCLUDE=""
|
||||
RUBYINCLUDE2=""
|
||||
|
|
@ -103,6 +104,9 @@ while [ "$*" != "" ]; do
|
|||
-without-qtbinding)
|
||||
HAVE_QTBINDINGS=0
|
||||
;;
|
||||
-with-openmp)
|
||||
HAVE_OPENMP=1
|
||||
;;
|
||||
-without-qt-uitools)
|
||||
HAVE_QT_UITOOLS=0
|
||||
;;
|
||||
|
|
@ -255,6 +259,7 @@ while [ "$*" != "" ]; do
|
|||
echo " -with-qtbinding Create Qt bindings for ruby scripts [default]"
|
||||
echo " -without-qtbinding Don't create Qt bindings for ruby scripts"
|
||||
echo " -without-qt-uitools Don't include uitools in Qt binding"
|
||||
echo " -with-openmp Enable OpenMP parallelization for hierarchical processing"
|
||||
echo " -with-64bit-coord Use long (64bit) coordinates - EXPERIMENTAL FEATURE"
|
||||
echo " (only available for gcc>=4.4 for 64bit build)"
|
||||
echo " -without-64bit-coord Don't use long (64bit) coordinates [default]"
|
||||
|
|
@ -601,6 +606,7 @@ echo " HAVE_PNG=$HAVE_PNG"
|
|||
echo " HAVE_EXPAT=$HAVE_EXPAT"
|
||||
echo " HAVE_GIT2=$HAVE_GIT2"
|
||||
echo " HAVE_LSTREAM=$HAVE_LSTREAM"
|
||||
echo " HAVE_OPENMP=$HAVE_OPENMP"
|
||||
echo " RPATH=$RPATH"
|
||||
|
||||
mkdir -p $BUILD
|
||||
|
|
@ -676,6 +682,7 @@ qmake_options=(
|
|||
HAVE_GIT2="$HAVE_GIT2"
|
||||
HAVE_LSTREAM="$HAVE_LSTREAM"
|
||||
HAVE_CPP20="$HAVE_CPP20"
|
||||
HAVE_OPENMP="$HAVE_OPENMP"
|
||||
PREFIX="$BIN"
|
||||
RPATH="$RPATH"
|
||||
KLAYOUT_VERSION="$KLAYOUT_VERSION"
|
||||
|
|
|
|||
|
|
@ -766,46 +766,52 @@ CompoundRegionGeometricalBoolOperationNode::implement_bool (CompoundRegionOperat
|
|||
one_a.push_back (std::unordered_set<T1> ());
|
||||
|
||||
shape_interactions<T, T> computed_a;
|
||||
child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
|
||||
|
||||
std::vector<std::unordered_set<T2> > one_b;
|
||||
one_b.push_back (std::unordered_set<T2> ());
|
||||
|
||||
shape_interactions<T, T> computed_b;
|
||||
|
||||
bool can_parallel = (m_op != GeometricalOp::And && m_op != GeometricalOp::Not);
|
||||
|
||||
#if defined(_OPENMP)
|
||||
if (can_parallel && proc->threads() > 0) {
|
||||
#pragma omp task shared(one_a, computed_a, cache, layout, cell, interactions, proc)
|
||||
{
|
||||
child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
|
||||
}
|
||||
#pragma omp task shared(one_b, computed_b, cache, layout, cell, interactions, proc)
|
||||
{
|
||||
child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
|
||||
}
|
||||
#pragma omp taskwait
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
|
||||
if (!one_a.front().empty()) {
|
||||
child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
|
||||
} else {
|
||||
if (!can_parallel) { // And or Not and A is empty
|
||||
return; // nothing to do, results remain empty
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (one_a.front ().empty ()) {
|
||||
|
||||
if (m_op == GeometricalOp::And || m_op == GeometricalOp::Not) {
|
||||
|
||||
if (!can_parallel) {
|
||||
// .. no results ..
|
||||
|
||||
} else {
|
||||
|
||||
std::vector<std::unordered_set<T2> > one_b;
|
||||
one_b.push_back (std::unordered_set<T2> ());
|
||||
|
||||
shape_interactions<T, T> computed_b;
|
||||
child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
|
||||
|
||||
copy_results (results, one_b);
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
std::vector<std::unordered_set<T2> > one_b;
|
||||
one_b.push_back (std::unordered_set<T2> ());
|
||||
|
||||
shape_interactions<T, T> computed_b;
|
||||
child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
|
||||
|
||||
if (one_b.front ().empty ()) {
|
||||
|
||||
if (m_op != GeometricalOp::And) {
|
||||
copy_results (results, one_a);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
run_bool (m_op, layout, one_a.front (), one_b.front (), results.front ());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -934,30 +940,54 @@ void compound_region_generic_operation_node<TS, TI, TR>::implement_compute_local
|
|||
shape_interactions<TTS, TTI> self_interactions_heap;
|
||||
const shape_interactions<TTS, TTI> &self_interactions = interactions_for_child (interactions, 0, self_interactions_heap);
|
||||
|
||||
self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
|
||||
std::vector<db::generic_shape_iterator<TI> > iiv;
|
||||
std::vector<std::unordered_set<TI> > intruder_results;
|
||||
intruder_results.resize (children () - 1); // allocate memory upfront
|
||||
|
||||
#if defined(_OPENMP)
|
||||
if (proc->threads() > 0) {
|
||||
#pragma omp task shared(self_result, self_interactions_heap, cache, layout, cell, interactions, proc)
|
||||
{
|
||||
self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
|
||||
}
|
||||
for (unsigned int ci = 1; ci < children (); ++ci) {
|
||||
#pragma omp task shared(intruder_results, cache, layout, cell, interactions, proc) firstprivate(ci)
|
||||
{
|
||||
const CompoundRegionOperationNode *intruder = child (ci);
|
||||
std::vector<std::unordered_set<TI> > intruder_result;
|
||||
intruder_result.push_back (std::unordered_set<TI> ());
|
||||
|
||||
shape_interactions<TTS, TTI> intruder_interactions_heap;
|
||||
const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
|
||||
|
||||
intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
|
||||
intruder_results[ci - 1] = std::move(intruder_result.front());
|
||||
}
|
||||
}
|
||||
#pragma omp taskwait
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
|
||||
|
||||
for (unsigned int ci = 1; ci < children (); ++ci) {
|
||||
|
||||
const CompoundRegionOperationNode *intruder = child (ci);
|
||||
std::vector<std::unordered_set<TI> > intruder_result;
|
||||
intruder_result.push_back (std::unordered_set<TI> ());
|
||||
|
||||
shape_interactions<TTS, TTI> intruder_interactions_heap;
|
||||
const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
|
||||
|
||||
intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
|
||||
intruder_results[ci - 1] = std::move(intruder_result.front());
|
||||
}
|
||||
}
|
||||
|
||||
db::generic_shape_iterator <TS> is (self_result.front ().begin (), self_result.front ().end ());
|
||||
|
||||
std::vector<db::generic_shape_iterator<TI> > iiv;
|
||||
std::vector<std::unordered_set<TI> > intruder_results;
|
||||
intruder_results.reserve (children () - 1); // important, so that the memory layout will not change while we generate them
|
||||
|
||||
for (unsigned int ci = 1; ci < children (); ++ci) {
|
||||
|
||||
const CompoundRegionOperationNode *intruder = child (ci);
|
||||
std::vector<std::unordered_set<TI> > intruder_result;
|
||||
intruder_result.push_back (std::unordered_set<TI> ());
|
||||
|
||||
shape_interactions<TTS, TTI> intruder_interactions_heap;
|
||||
const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
|
||||
|
||||
intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
|
||||
|
||||
intruder_results.push_back (std::unordered_set<TI> ());
|
||||
intruder_results.back ().swap (intruder_result.front ());
|
||||
|
||||
iiv.push_back (db::generic_shape_iterator<TI> (intruder_results.back ().begin (), intruder_results.back ().end ()));
|
||||
|
||||
iiv.push_back (db::generic_shape_iterator <TI> (intruder_results[ci - 1].begin (), intruder_results[ci - 1].end ()));
|
||||
}
|
||||
|
||||
db::local_processor <TS, TI, TR> lproc (layout);
|
||||
|
|
|
|||
|
|
@ -57,12 +57,15 @@ class CompoundRegionOperationNode;
|
|||
* This cache is important to avoid duplicate evaluation of the same node in
|
||||
* a diamond-graph structure of nodes.
|
||||
*/
|
||||
#include "tlThreads.h"
|
||||
|
||||
class DB_PUBLIC CompoundRegionOperationCache
|
||||
{
|
||||
public:
|
||||
template <class TR>
|
||||
std::pair<bool, std::vector<std::unordered_set<TR> > *> get (const CompoundRegionOperationNode *node)
|
||||
{
|
||||
tl::MutexLocker lock (&m_mutex);
|
||||
bool valid = false;
|
||||
std::vector<std::unordered_set<TR> > *cache = 0;
|
||||
get_cache (cache, valid, node);
|
||||
|
|
@ -70,6 +73,7 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
tl::Mutex m_mutex;
|
||||
std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::PolygonRefWithProperties> > > m_cache_polyref_wp;
|
||||
std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::PolygonWithProperties> > > m_cache_poly_wp;
|
||||
std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::EdgeWithProperties> > > m_cache_edge_wp;
|
||||
|
|
|
|||
|
|
@ -890,18 +890,41 @@ void local_processor<TS, TI, TR>::compute_contexts (local_processor_contexts<TS,
|
|||
|
||||
tl::SelfTimer timer (tl::verbosity () > base_verbosity () + 10, tl::to_string (tr ("Computing contexts for ")) + description (op));
|
||||
|
||||
#if defined(_OPENMP)
|
||||
if (threads () > 0) {
|
||||
mp_cc_job.reset (0);
|
||||
} else {
|
||||
mp_cc_job.reset (0);
|
||||
}
|
||||
#else
|
||||
if (threads () > 0) {
|
||||
mp_cc_job.reset (new tl::Job<local_processor_context_computation_worker<TS, TI, TR> > (threads ()));
|
||||
} else {
|
||||
mp_cc_job.reset (0);
|
||||
}
|
||||
#endif
|
||||
|
||||
contexts.clear ();
|
||||
contexts.set_intruder_layers (intruder_layers);
|
||||
contexts.set_subject_layer (subject_layer);
|
||||
|
||||
typename local_processor_cell_contexts<TS, TI, TR>::context_key_type intruders;
|
||||
#if defined(_OPENMP)
|
||||
if (threads() > 0) {
|
||||
int nthreads = threads();
|
||||
#pragma omp parallel num_threads(nthreads) shared(contexts, intruders)
|
||||
{
|
||||
#pragma omp single
|
||||
{
|
||||
issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
|
||||
}
|
||||
#else
|
||||
issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
|
||||
#endif
|
||||
|
||||
if (mp_cc_job.get ()) {
|
||||
mp_cc_job->start ();
|
||||
|
|
@ -926,11 +949,24 @@ void local_processor<TS, TI, TR>::issue_compute_contexts (local_processor_contex
|
|||
{
|
||||
bool is_small_job = subject_cell->begin ().at_end ();
|
||||
|
||||
#if defined(_OPENMP)
|
||||
if (! is_small_job && threads() > 0) {
|
||||
typename local_processor_cell_contexts<TS, TI, TR>::context_key_type my_intruders;
|
||||
my_intruders.swap (intruders);
|
||||
#pragma omp task shared(contexts) firstprivate(parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist)
|
||||
{
|
||||
compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist);
|
||||
}
|
||||
} else {
|
||||
compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist);
|
||||
}
|
||||
#else
|
||||
if (! is_small_job && mp_cc_job.get ()) {
|
||||
mp_cc_job->schedule (new local_processor_context_computation_task<TS, TI, TR> (this, contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist));
|
||||
} else {
|
||||
compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class TS, class TI, class TR>
|
||||
|
|
@ -1164,8 +1200,6 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
|
|||
|
||||
if (threads () > 0) {
|
||||
|
||||
std::unique_ptr<tl::Job<local_processor_result_computation_worker<TS, TI, TR> > > rc_job (new tl::Job<local_processor_result_computation_worker<TS, TI, TR> > (threads ()));
|
||||
|
||||
// schedule computation jobs in "waves": we need to make sure they are executed
|
||||
// bottom-up. So we identify a new bunch of cells each time we pass through the cell set
|
||||
// and proceed until all cells are removed.
|
||||
|
|
@ -1188,6 +1222,8 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
|
|||
std::vector<db::cell_index_type> next_cells_bu;
|
||||
next_cells_bu.reserve (cells_bu.size ());
|
||||
|
||||
std::vector<local_processor_result_computation_task<TS, TI, TR>*> tasks;
|
||||
|
||||
for (std::vector<db::cell_index_type>::const_iterator bu = cells_bu.begin (); bu != cells_bu.end (); ++bu) {
|
||||
|
||||
tl::MutexLocker locker (& contexts.lock ());
|
||||
|
|
@ -1197,7 +1233,7 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
|
|||
|
||||
if (later.find (*bu) == later.end ()) {
|
||||
|
||||
rc_job->schedule (new local_processor_result_computation_task<TS, TI, TR> (this, contexts, cpc->first, &cpc->second, op, output_layers));
|
||||
tasks.push_back(new local_processor_result_computation_task<TS, TI, TR> (this, contexts, cpc->first, &cpc->second, op, output_layers));
|
||||
any = true;
|
||||
|
||||
} else {
|
||||
|
|
@ -1218,20 +1254,37 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
|
|||
break;
|
||||
}
|
||||
|
||||
if (rc_job.get ()) {
|
||||
|
||||
if (!tasks.empty()) {
|
||||
try {
|
||||
|
||||
rc_job->start ();
|
||||
#if defined(_OPENMP)
|
||||
int nthreads = threads();
|
||||
#pragma omp parallel for num_threads(nthreads) schedule(dynamic)
|
||||
for (long long i = 0; i < (long long)tasks.size(); ++i) {
|
||||
tasks[i]->perform();
|
||||
}
|
||||
#else
|
||||
std::unique_ptr<tl::Job<local_processor_result_computation_worker<TS, TI, TR> > > rc_job (new tl::Job<local_processor_result_computation_worker<TS, TI, TR> > (threads ()));
|
||||
for (size_t i = 0; i < tasks.size(); ++i) {
|
||||
rc_job->schedule(tasks[i]);
|
||||
}
|
||||
rc_job->start();
|
||||
while (! rc_job->wait (10)) {
|
||||
progress.set (get_progress ());
|
||||
}
|
||||
|
||||
#endif
|
||||
} catch (...) {
|
||||
rc_job->terminate ();
|
||||
#if !defined(_OPENMP)
|
||||
// rc_job cleanup will be handled by the smart pointer, but we don't have it explicitly throwing here in openmp mode
|
||||
#endif
|
||||
for (size_t i = 0; i < tasks.size(); ++i) { delete tasks[i]; }
|
||||
throw;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
for (size_t i = 0; i < tasks.size(); ++i) {
|
||||
delete tasks[i];
|
||||
}
|
||||
progress.set(get_progress());
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1238,11 +1238,11 @@ module DRC
|
|||
|
||||
# %DRC%
|
||||
# @name threads
|
||||
# @brief Specifies the number of CPU cores to use in tiling mode
|
||||
# @brief Specifies the number of CPU cores to use in tiling and hierarchical mode
|
||||
# @synopsis threads(n)
|
||||
# @synopsis threads
|
||||
# If using threads, tiles are distributed on multiple CPU cores for
|
||||
# parallelization. Still, all tiles must be processed before the
|
||||
# If using threads, tiles or hierarchical cells are distributed on multiple CPU cores for
|
||||
# parallelization. Still, all tiles or cells must be processed before the
|
||||
# operation proceeds with the next statement.
|
||||
#
|
||||
# Without an argument, "threads" will return the current number of
|
||||
|
|
|
|||
|
|
@ -216,6 +216,20 @@ msvc {
|
|||
}
|
||||
}
|
||||
|
||||
# Optional OpenMP support, active when qmake is invoked with HAVE_OPENMP=1
# (build.sh passes this when called with -with-openmp).
#
# _OPENMP is intentionally NOT added to DEFINES: an OpenMP-enabled compiler
# predefines it itself (with the specification's yyyymm version value), so a
# command-line definition would override that value with "1" and trigger macro
# redefinition diagnostics. The "#if defined(_OPENMP)" guards in the sources
# keep working because the flags below make the compiler provide the macro.
equals(HAVE_OPENMP, "1") {
  msvc {
    # /openmp is a compiler option only; it is not passed to the linker
    # because link.exe does not recognize it.
    QMAKE_CXXFLAGS += /openmp
  } else:macx {
    # NOTE(review): presumably targets Apple clang, which only accepts
    # -fopenmp through -Xpreprocessor and needs libomp linked explicitly -
    # confirm against the supported macOS toolchains.
    QMAKE_CXXFLAGS += -Xpreprocessor -fopenmp
    LIBS += -lomp
  } else {
    # GCC-style drivers need the flag for compiling and for linking
    QMAKE_CXXFLAGS += -fopenmp
    QMAKE_LFLAGS += -fopenmp
  }
}
|
||||
|
||||
win32 {
|
||||
|
||||
QMAKE_LFLAGS += -Wl,--exclude-all-symbols
|
||||
|
|
|
|||
Loading…
Reference in New Issue