iverilog/vvp/vthread.cc

/*
 * Copyright (c) 2001-2016 Stephen Williams (steve@icarus.com)
 *
 *    This source code is free software; you can redistribute it
 *    and/or modify it in source code form under the terms of the GNU
 *    General Public License as published by the Free Software
 *    Foundation; either version 2 of the License, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

# include  "config.h"
# include  "vthread.h"
# include  "codes.h"
# include  "schedule.h"
# include  "ufunc.h"
# include  "event.h"
# include  "vpi_priv.h"
# include  "vvp_net_sig.h"
# include  "vvp_cobject.h"
# include  "vvp_darray.h"
# include  "class_type.h"
#ifdef CHECK_WITH_VALGRIND
# include  "vvp_cleanup.h"
#endif
# include  <set>
# include  <typeinfo>
# include  <vector>
# include  <cstdlib>
# include  <climits>
# include  <cstring>
# include  <cmath>
# include  <cassert>

# include  <iostream>
# include  <cstdio>

using namespace std;

/* This is the size of an unsigned long in bits. This is just a
   convenience macro. */
# define CPU_WORD_BITS (8*sizeof(unsigned long))
# define TOP_BIT (1UL << (CPU_WORD_BITS-1))

/*
 * This vthread_s structure describes all there is to know about a
 * thread, including its program counter, all the private bits it
 * holds, and its place in other lists.
 *
 *
 * ** Notes On The Interactions of %fork/%join/%end:
 *
 * The %fork instruction creates a new thread and pushes that into a
 * set of children for the thread. This new thread, then, becomes a
 * child of the current thread, and the current thread a parent of the
 * new thread. Any child can be reaped by a %join.
 *
 * Children that are detached with %join/detach need to have a different
 * parent/child relationship since the parent can still effect them if
 * it uses the %disable/fork or %wait/fork opcodes. The i_am_detached
 * flag and detached_children set are used for this relationship.
 *
 * Children placed into a task or function scope are given special
 * treatment, which is required to make task/function calls that they
 * represent work correctly. These task/function children are copied
 * into a task_func_children set to mark them for this handling. %join
 * operations will guarantee that task/function threads are joined first,
 * before any non-task/function threads.
 *
 * It is a programming error for a thread that created threads to not
 * %join (or %join/detach) as many as it created before it %ends. The
 * children set will get messed up otherwise.
 *
 * the i_am_joining flag is a clue to children that the parent is
 * blocked in a %join and may need to be scheduled. The %end
 * instruction will check this flag in the parent to see if it should
 * notify the parent that something is interesting.
 *
 * The i_have_ended flag, on the other hand, is used by threads to
 * tell their parents that they are already dead. A thread that
 * executes %end will set its own i_have_ended flag and let its parent
 * reap it when the parent does the %join. If a thread has its
 * schedule_parent_on_end flag set already when it %ends, then it
 * reaps itself and simply schedules its parent. If a child has its
 * i_have_ended flag set when a thread executes %join, then it is free
 * to reap the child immediately.
 */

struct vthread_s {
      vthread_s();

      void debug_dump(ostream&fd, const char*label_text);

	/* This is the program counter. */
      vvp_code_t pc;
	/* These hold the private thread bits. */
      enum { FLAGS_COUNT = 256, WORDS_COUNT = 16 };
      vvp_bit4_t flags[FLAGS_COUNT];

	/* These are the word registers. */
      union {
	    int64_t  w_int;
	    uint64_t w_uint;
      } words[WORDS_COUNT];

	// These vectors are depths within the parent thread's
	// corresponding stack.  This is how the %ret/* instructions
	// get at parent thread arguments.
      vector<unsigned> args_real;
      vector<unsigned> args_str;
      vector<unsigned> args_vec4;

    private:
      vector<vvp_vector4_t>stack_vec4_;
    public:
      inline vvp_vector4_t pop_vec4(void)
      {
	    assert(! stack_vec4_.empty());
	    vvp_vector4_t val = stack_vec4_.back();
	    stack_vec4_.pop_back();
	    return val;
      }
      inline void push_vec4(const vvp_vector4_t&val)
      {
	    stack_vec4_.push_back(val);
      }
      inline const vvp_vector4_t& peek_vec4(unsigned depth)
      {
	    unsigned size = stack_vec4_.size();
	    assert(depth < size);
	    unsigned use_index = size-1-depth;
	    return stack_vec4_[use_index];
      }
      inline vvp_vector4_t& peek_vec4(void)
      {
	    unsigned use_index = stack_vec4_.size();
	    assert(use_index >= 1);
	    return stack_vec4_[use_index-1];
      }
      inline void poke_vec4(unsigned depth, const vvp_vector4_t&val)
      {
	    assert(depth < stack_vec4_.size());
	    unsigned use_index = stack_vec4_.size()-1-depth;
	    stack_vec4_[use_index] = val;
      }
      inline void pop_vec4(unsigned cnt)
      {
	    while (cnt > 0) {
		  stack_vec4_.pop_back();
		  cnt -= 1;
	    }
      }


    private:
      vector<double> stack_real_;
    public:
      inline double pop_real(void)
      {
	    assert(! stack_real_.empty());
	    double val = stack_real_.back();
	    stack_real_.pop_back();
	    return val;
      }
      inline void push_real(double val)
      {
	    stack_real_.push_back(val);
      }
      inline double peek_real(unsigned depth)
      {
	    assert(depth < stack_real_.size());
	    unsigned use_index = stack_real_.size()-1-depth;
	    return stack_real_[use_index];
      }
      inline void poke_real(unsigned depth, double val)
      {
	    assert(depth < stack_real_.size());
	    unsigned use_index = stack_real_.size()-1-depth;
	    stack_real_[use_index] = val;
      }
      inline void pop_real(unsigned cnt)
      {
	    while (cnt > 0) {
		  stack_real_.pop_back();
		  cnt -= 1;
	    }
      }

	/* Strings are operated on using a forth-like operator
	   set. Items at the top of the stack (back()) are the objects
	   operated on except for special cases. New objects are
	   pushed onto the top (back()) and pulled from the top
	   (back()) only. */
    private:
      vector<string> stack_str_;
    public:
      inline string pop_str(void)
      {
	    assert(! stack_str_.empty());
	    string val = stack_str_.back();
	    stack_str_.pop_back();
	    return val;
      }
      inline void push_str(const string&val)
      {
	    stack_str_.push_back(val);
      }
      inline string&peek_str(unsigned depth)
      {
	    assert(depth<stack_str_.size());
	    unsigned use_index = stack_str_.size()-1-depth;
	    return stack_str_[use_index];
      }
      inline void poke_str(unsigned depth, const string&val)
      {
	    assert(depth < stack_str_.size());
	    unsigned use_index = stack_str_.size()-1-depth;
	    stack_str_[use_index] = val;
      }
      inline void pop_str(unsigned cnt)
      {
	    while (cnt > 0) {
		  stack_str_.pop_back();
		  cnt -= 1;
	    }
      }

	/* Objects are also operated on in a stack. */
    private:
      enum { STACK_OBJ_MAX_SIZE = 32 };
      vvp_object_t stack_obj_[STACK_OBJ_MAX_SIZE];
      unsigned stack_obj_size_;
    public:
      inline vvp_object_t& peek_object(void)
      {
	    assert(stack_obj_size_ > 0);
	    return stack_obj_[stack_obj_size_-1];
      }
      inline void pop_object(vvp_object_t&obj)
      {
	    assert(stack_obj_size_ > 0);
	    stack_obj_size_ -= 1;
	    obj = stack_obj_[stack_obj_size_];
	    stack_obj_[stack_obj_size_].reset(0);
      }
      inline void pop_object(unsigned cnt, unsigned skip =0)
      {
	    assert((cnt+skip) <= stack_obj_size_);
	    for (size_t idx = stack_obj_size_-skip-cnt ; idx < stack_obj_size_-skip ; idx += 1)
		  stack_obj_[idx].reset(0);
	    stack_obj_size_ -= cnt;
	    for (size_t idx = stack_obj_size_-skip ; idx < stack_obj_size_ ; idx += 1)
		  stack_obj_[idx] = stack_obj_[idx+skip];
	    for (size_t idx = stack_obj_size_ ; idx < stack_obj_size_+skip ; idx += 1)
		  stack_obj_[idx].reset(0);
      }
      inline void push_object(const vvp_object_t&obj)
      {
	    assert(stack_obj_size_ < STACK_OBJ_MAX_SIZE);
	    stack_obj_[stack_obj_size_] = obj;
	    stack_obj_size_ += 1;
      }

	/* My parent sets this when it wants me to wake it up. */
      unsigned i_am_joining      :1;
      unsigned i_am_detached     :1;
      unsigned i_am_waiting      :1;
      unsigned i_am_in_function  :1; // True if running function code
      unsigned i_have_ended      :1;
      unsigned i_was_disabled    :1;
      unsigned waiting_for_event :1;
      unsigned is_scheduled      :1;
      unsigned delay_delete      :1;
	/* This points to the children of the thread. */
      set<struct vthread_s*>children;
	/* This points to the detached children of the thread. */
      set<struct vthread_s*>detached_children;
	/* No more than 1 of the children are tasks or functions. */
      set<vthread_s*>task_func_children;
	/* This points to my parent, if I have one. */
      struct vthread_s*parent;
	/* This points to the containing scope. */
      __vpiScope*parent_scope;
	/* This is used for keeping wait queues. */
      struct vthread_s*wait_next;
	/* These are used to access automatically allocated items. */
      vvp_context_t wt_context, rd_context;
	/* These are used to pass non-blocking event control information. */
      vvp_net_t*event;
      uint64_t ecount;

      inline void cleanup()
      {
	    if (i_was_disabled) {
		  stack_vec4_.clear();
		  stack_real_.clear();
		  stack_str_.clear();
		  pop_object(stack_obj_size_);
	    }
	    assert(stack_vec4_.empty());
	    assert(stack_real_.empty());
	    assert(stack_str_.empty());
	    assert(stack_obj_size_ == 0);
      }
};

inline vthread_s::vthread_s()
{
      stack_obj_size_ = 0;
}

void vthread_s::debug_dump(ostream&fd, const char*label)
{
      fd << "**** " << label << endl;
      fd << "**** ThreadId: " << this << ", parent id: " << parent << endl;

      fd << "**** Flags: ";
      for (int idx = 0 ; idx < FLAGS_COUNT ; idx += 1)
	    fd << flags[idx];
      fd << endl;
      fd << "**** vec4 stack..." << endl;
      for (size_t idx = stack_vec4_.size() ; idx > 0 ; idx -= 1)
	    fd << "    " << (stack_vec4_.size()-idx) << ": " << stack_vec4_[idx-1] << endl;
      fd << "**** str stack (" << stack_str_.size() << ")..." << endl;
      fd << "**** obj stack (" << stack_obj_size_ << ")..." << endl;
      fd << "**** args_vec4 array (" << args_vec4.size() << ")..." << endl;
      for (size_t idx = 0 ; idx < args_vec4.size() ; idx += 1)
	    fd << "    " << idx << ": " << args_vec4[idx] << endl;
      fd << "**** Done ****" << endl;
}

static bool test_joinable(vthread_t thr, vthread_t child);
static void do_join(vthread_t thr, vthread_t child);

__vpiScope* vthread_scope(struct vthread_s*thr)
{
      return thr->parent_scope;
}

struct vthread_s*running_thread = 0;


void vthread_push_vec4(struct vthread_s*thr, const vvp_vector4_t&val)
{
      thr->push_vec4(val);
}

void vthread_push_real(struct vthread_s*thr, double val)
{
      thr->push_real(val);
}

void vthread_push_str(struct vthread_s*thr, const string&val)
{
      thr->push_str(val);
}

void vthread_pop_vec4(struct vthread_s*thr, unsigned depth)
{
      thr->pop_vec4(depth);
}

void vthread_pop_real(struct vthread_s*thr, unsigned depth)
{
      thr->pop_real(depth);
}

void vthread_pop_str(struct vthread_s*thr, unsigned depth)
{
      thr->pop_str(depth);
}

const string&vthread_get_str_stack(struct vthread_s*thr, unsigned depth)
{
      return thr->peek_str(depth);
}

double vthread_get_real_stack(struct vthread_s*thr, unsigned depth)
{
      return thr->peek_real(depth);
}

const vvp_vector4_t& vthread_get_vec4_stack(struct vthread_s*thr, unsigned depth)
{
      return thr->peek_vec4(depth);
}

/*
 * Some thread management functions
 */
/*
 * This is a function to get a vvp_queue handle from the variable
 * referenced by "net". If the queue is nil, then allocated it and
 * assign the value to the net. Note that this function is
 * parameterized by the queue type so that we can create the right
 * derived type of queue object.
 */
template <class VVP_QUEUE> static vvp_queue*get_queue_object(vthread_t thr, vvp_net_t*net)
{
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      vvp_queue*dqueue = obj->get_object().peek<vvp_queue>();
      if (dqueue == 0) {
	    assert(obj->get_object().test_nil());
	    dqueue = new VVP_QUEUE;
	    vvp_object_t val (dqueue);
	    vvp_net_ptr_t ptr (net, 0);
	    vvp_send_object(ptr, val, thr->wt_context);
      }

      return dqueue;
}

template <class T> T coerce_to_width(const T&that, unsigned width)
{
      if (that.size() == width)
	    return that;

      assert(that.size() > width);
      T res (width);
      for (unsigned idx = 0 ;  idx < width ;  idx += 1)
	    res.set_bit(idx, that.value(idx));

      return res;
}

/* Explicitly define the vvp_vector4_t version of coerce_to_width(). */
template vvp_vector4_t coerce_to_width(const vvp_vector4_t&that,
                                       unsigned width);


static void multiply_array_imm(unsigned long*res, unsigned long*val,
			       unsigned words, unsigned long imm)
{
      for (unsigned idx = 0 ; idx < words ; idx += 1)
	    res[idx] = 0;

      for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) {
	    unsigned long sum;
	    unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum);

	    unsigned long carry = 0;
	    res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry);
	    for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) {
		  res[add_idx] = add_with_carry(res[add_idx], sum, carry);
		  sum = 0;
	    }
      }
}

/*
 * Allocate a context for use by a child thread. By preference, use
 * the last freed context. If none available, create a new one. Add
 * it to the list of live contexts in that scope.
 */
static vvp_context_t vthread_alloc_context(__vpiScope*scope)
{
      assert(scope->is_automatic());

      vvp_context_t context = scope->free_contexts;
      if (context) {
            scope->free_contexts = vvp_get_next_context(context);
            for (unsigned idx = 0 ; idx < scope->nitem ; idx += 1) {
                  scope->item[idx]->reset_instance(context);
            }
      } else {
            context = vvp_allocate_context(scope->nitem);
            for (unsigned idx = 0 ; idx < scope->nitem ; idx += 1) {
                  scope->item[idx]->alloc_instance(context);
            }
      }

      vvp_set_next_context(context, scope->live_contexts);
      scope->live_contexts = context;

      return context;
}

/*
 * Free a context previously allocated to a child thread by pushing it
 * onto the freed context stack. Remove it from the list of live contexts
 * in that scope.
 */
static void vthread_free_context(vvp_context_t context, __vpiScope*scope)
{
      assert(scope->is_automatic());
      assert(context);

      if (context == scope->live_contexts) {
            scope->live_contexts = vvp_get_next_context(context);
      } else {
            vvp_context_t tmp = scope->live_contexts;
            while (context != vvp_get_next_context(tmp)) {
                  assert(tmp);
                  tmp = vvp_get_next_context(tmp);
            }
            vvp_set_next_context(tmp, vvp_get_next_context(context));
      }

      vvp_set_next_context(context, scope->free_contexts);
      scope->free_contexts = context;
}

#ifdef CHECK_WITH_VALGRIND
void contexts_delete(struct __vpiScope*scope)
{
      vvp_context_t context = scope->free_contexts;

      while (context) {
	    scope->free_contexts = vvp_get_next_context(context);
	    for (unsigned idx = 0; idx < scope->nitem; idx += 1) {
		  scope->item[idx]->free_instance(context);
	    }
	    free(context);
	    context = scope->free_contexts;
      }
      free(scope->item);
}
#endif

/*
 * Create a new thread with the given start address.
 */
vthread_t vthread_new(vvp_code_t pc, __vpiScope*scope)
{
      vthread_t thr = new struct vthread_s;
      thr->pc     = pc;
	//thr->bits4  = vvp_vector4_t(32);
      thr->parent = 0;
      thr->parent_scope = scope;
      thr->wait_next = 0;
      thr->wt_context = 0;
      thr->rd_context = 0;

      thr->i_am_joining  = 0;
      thr->i_am_detached = 0;
      thr->i_am_waiting  = 0;
      thr->i_am_in_function = 0;
      thr->is_scheduled  = 0;
      thr->i_have_ended  = 0;
      thr->i_was_disabled = 0;
      thr->delay_delete  = 0;
      thr->waiting_for_event = 0;
      thr->event  = 0;
      thr->ecount = 0;

      thr->flags[0] = BIT4_0;
      thr->flags[1] = BIT4_1;
      thr->flags[2] = BIT4_X;
      thr->flags[3] = BIT4_Z;
      for (int idx = 4 ; idx < 8 ; idx += 1)
	    thr->flags[idx] = BIT4_X;

      scope->threads .insert(thr);
      return thr;
}

#ifdef CHECK_WITH_VALGRIND
#if 0
/*
 * These are not currently correct. If you use them you will get
 * double delete messages. There is still a leak related to a
 * waiting event that needs to be investigated.
 */

static void wait_next_delete(vthread_t base)
{
      while (base) {
	    vthread_t tmp = base->wait_next;
	    delete base;
	    base = tmp;
	    if (base->waiting_for_event == 0) break;
      }
}

static void child_delete(vthread_t base)
{
      while (base) {
	    vthread_t tmp = base->child;
	    delete base;
	    base = tmp;
      }
}
#endif

void vthreads_delete(struct __vpiScope*scope)
{
      for (std::set<vthread_t>::iterator cur = scope->threads.begin()
		 ; cur != scope->threads.end() ; ++ cur ) {
	    delete *cur;
      }
      scope->threads.clear();
}
#endif

/*
 * Reaping pulls the thread out of the stack of threads. If I have a
 * child, then hand it over to my parent or fully detach it.
 */
static void vthread_reap(vthread_t thr)
{
      if (! thr->children.empty()) {
	    for (set<vthread_t>::iterator cur = thr->children.begin()
		       ; cur != thr->children.end() ; ++cur) {
		  vthread_t child = *cur;
		  assert(child);
		  assert(child->parent == thr);
		  child->parent = thr->parent;
	    }
      }
      if (! thr->detached_children.empty()) {
	    for (set<vthread_t>::iterator cur = thr->detached_children.begin()
		       ; cur != thr->detached_children.end() ; ++cur) {
		  vthread_t child = *cur;
		  assert(child);
		  assert(child->parent == thr);
		  assert(child->i_am_detached);
		  child->parent = 0;
		  child->i_am_detached = 0;
	    }
      }
      if (thr->parent) {
	      /* assert that the given element was removed. */
	    if (thr->i_am_detached) {
		  size_t res = thr->parent->detached_children.erase(thr);
		  assert(res == 1);
	    } else {
		  size_t res = thr->parent->children.erase(thr);
		  assert(res == 1);
	    }
      }

      thr->parent = 0;

	// Remove myself from the containing scope if needed.
      thr->parent_scope->threads.erase(thr);

      thr->pc = codespace_null();

	/* If this thread is not scheduled, then is it safe to delete
	   it now. Otherwise, let the schedule event (which will
	   execute the thread at of_ZOMBIE) delete the object. */
      if ((thr->is_scheduled == 0) && (thr->waiting_for_event == 0)) {
	    assert(thr->children.empty());
	    assert(thr->wait_next == 0);
	    if (thr->delay_delete)
		  schedule_del_thr(thr);
	    else
		  vthread_delete(thr);
      }
}

void vthread_delete(vthread_t thr)
{
      thr->cleanup();
      delete thr;
}

void vthread_mark_scheduled(vthread_t thr)
{
      while (thr != 0) {
	    assert(thr->is_scheduled == 0);
	    thr->is_scheduled = 1;
	    thr = thr->wait_next;
      }
}

void vthread_delay_delete()
{
      if (running_thread)
	    running_thread->delay_delete = 1;
}

/*
 * This function runs each thread by fetching an instruction,
 * incrementing the PC, and executing the instruction. The thread may
 * be the head of a list, so each thread is run so far as possible.
 */
void vthread_run(vthread_t thr)
{
      while (thr != 0) {
	    vthread_t tmp = thr->wait_next;
	    thr->wait_next = 0;

	    assert(thr->is_scheduled);
	    thr->is_scheduled = 0;

            running_thread = thr;

	    for (;;) {
		  vvp_code_t cp = thr->pc;
		  thr->pc += 1;

		    /* Run the opcode implementation. If the execution of
		       the opcode returns false, then the thread is meant to
		       be paused, so break out of the loop. */
		  bool rc = (cp->opcode)(thr, cp);
		  if (rc == false)
			break;
	    }

	    thr = tmp;
      }
      running_thread = 0;
}

/*
 * The CHUNK_LINK instruction is a special next pointer for linking
 * chunks of code space. It's like a simplified %jmp.
 */
bool of_CHUNK_LINK(vthread_t thr, vvp_code_t code)
{
      assert(code->cptr);
      thr->pc = code->cptr;
      return true;
}

/*
 * This is called by an event functor to wake up all the threads on
 * its list. I in fact created that list in the %wait instruction, and
 * I also am certain that the waiting_for_event flag is set.
 */
void vthread_schedule_list(vthread_t thr)
{
      for (vthread_t cur = thr ;  cur ;  cur = cur->wait_next) {
	    assert(cur->waiting_for_event);
	    cur->waiting_for_event = 0;
      }

      schedule_vthread(thr, 0);
}

vvp_context_t vthread_get_wt_context()
{
      if (running_thread)
            return running_thread->wt_context;
      else
            return 0;
}

vvp_context_t vthread_get_rd_context()
{
      if (running_thread)
            return running_thread->rd_context;
      else
            return 0;
}

vvp_context_item_t vthread_get_wt_context_item(unsigned context_idx)
{
      assert(running_thread && running_thread->wt_context);
      return vvp_get_context_item(running_thread->wt_context, context_idx);
}

vvp_context_item_t vthread_get_rd_context_item(unsigned context_idx)
{
      assert(running_thread && running_thread->rd_context);
      return vvp_get_context_item(running_thread->rd_context, context_idx);
}

/*
 * %abs/wr
 */
bool of_ABS_WR(vthread_t thr, vvp_code_t)
{
      thr->push_real( fabs(thr->pop_real()) );
      return true;
}

bool of_ALLOC(vthread_t thr, vvp_code_t cp)
{
        /* Allocate a context. */
      vvp_context_t child_context = vthread_alloc_context(cp->scope);

        /* Push the allocated context onto the write context stack. */
      vvp_set_stacked_context(child_context, thr->wt_context);
      thr->wt_context = child_context;

      return true;
}

bool of_AND(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valb = thr->pop_vec4();
      vvp_vector4_t&vala = thr->peek_vec4();
      assert(vala.size() == valb.size());
      vala &= valb;
      return true;
}

/*
 * This function must ALWAYS be called with the val set to the right
 * size, and initialized with BIT4_0 bits. Certain optimizations rely
 * on that.
 */
static void get_immediate_rval(vvp_code_t cp, vvp_vector4_t&val)
{
      uint32_t vala = cp->bit_idx[0];
      uint32_t valb = cp->bit_idx[1];
      unsigned wid  = cp->number;

      if (valb == 0) {
	      // Special case: if the value is zero, we are done
	      // before we start.
	    if (vala == 0) return;

	      // Special case: The value has no X/Z bits, so we can
	      // use the setarray method to write the value all at once.
	    unsigned use_wid = 8*sizeof(unsigned long);
	    if (wid < use_wid)
		  use_wid = wid;
	    unsigned long tmp[1];
	    tmp[0] = vala;
	    val.setarray(0, use_wid, tmp);
	    return;
      }

	// The immediate value can be values bigger then 32 bits, but
	// only if the high bits are zero. So at most we need to run
	// through the loop below 32 times. Maybe less, if the target
	// width is less. We don't have to do anything special on that
	// because vala/valb bits will shift away so (vala|valb) will
	// turn to zero at or before 32 shifts.

      for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
	    uint32_t ba = 0;
	      // Convert the vala/valb bits to a ba number that
	      // matches the encoding of the vvp_bit4_t enumeration.
	    ba = (valb & 1) << 1;
	    ba |= vala & 1;

	      // Note that the val is already pre-filled with BIT4_0
	      // bits, os we only need to set non-zero bit values.
	    if (ba) val.set_bit(idx, (vvp_bit4_t)ba);

	    vala >>= 1;
	    valb >>= 1;
      }
}

/*
 * %add
 *
 * Pop r,
 * Pop l,
 * Push l+r
 *
 * Pop 2 and push 1 is the same as pop 1 and replace the remaining top
 * of the stack with a new value. That is what we will do.
 */
bool of_ADD(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t r = thr->pop_vec4();
	// Rather then pop l, use it directly from the stack. When we
	// assign to 'l', that will edit the top of the stack, which
	// replaces a pop and a pull.
      vvp_vector4_t&l = thr->peek_vec4();

      l.add(r);

      return true;
}

/*
 * %addi <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments, and push
 * the result.
 */
bool of_ADDI(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&l = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t r (wid, BIT4_0);
      get_immediate_rval (cp, r);

      l.add(r);

      return true;
}

/*
 * %add/wr
 */
bool of_ADD_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      thr->push_real(l + r);
      return true;
}

/* %assign/ar <array>, <delay>
 * Generate an assignment event to a real array. Index register 3
 * contains the canonical address of the word in the memory. <delay>
 * is the delay in simulation time. <bit> is the index register
 * containing the real value.
 */
bool of_ASSIGN_AR(vthread_t thr, vvp_code_t cp)
{
      long adr = thr->words[3].w_int;
      unsigned delay = cp->bit_idx[0];
      double value = thr->pop_real();

      if (adr >= 0) {
	    schedule_assign_array_word(cp->array, adr, value, delay);
      }

      return true;
}

/* %assign/ar/d <array>, <delay_idx>
 * Generate an assignment event to a real array. Index register 3
 * contains the canonical address of the word in the memory.
 * <delay_idx> is the integer register that contains the delay value.
 */
bool of_ASSIGN_ARD(vthread_t thr, vvp_code_t cp)
{
      long adr = thr->words[3].w_int;
      vvp_time64_t delay = thr->words[cp->bit_idx[0]].w_uint;
      double value = thr->pop_real();

      if (adr >= 0) {
	    schedule_assign_array_word(cp->array, adr, value, delay);
      }

      return true;
}

/* %assign/ar/e <array>
 * Generate an assignment event to a real array. Index register 3
 * contains the canonical address of the word in the memory. <bit>
 * is the index register containing the real value. The event
 * information is contained in the thread event control registers
 * and is set with %evctl.
 */
bool of_ASSIGN_ARE(vthread_t thr, vvp_code_t cp)
{
      long adr = thr->words[3].w_int;
      double value = thr->pop_real();

      if (adr >= 0) {
	    if (thr->ecount == 0) {
		  schedule_assign_array_word(cp->array, adr, value, 0);
	    } else {
		  schedule_evctl(cp->array, adr, value, thr->event,
		                 thr->ecount);
	    }
      }

      return true;
}

/*
 * %assign/vec4 <var>, <delay>
 */
bool of_ASSIGN_VEC4(vthread_t thr, vvp_code_t cp)
{
      vvp_net_ptr_t ptr (cp->net, 0);
      unsigned delay = cp->bit_idx[0];
      vvp_vector4_t&val = thr->peek_vec4();

      schedule_assign_vector(ptr, 0, 0, val, delay);
      thr->pop_vec4(1);
      return true;
}

/*
 * %assign/vec4/a/d <arr>, <offx>, <delx>
 */
bool of_ASSIGN_VEC4_A_D(vthread_t thr, vvp_code_t cp)
{
      int off_idx = cp->bit_idx[0];
      int del_idx = cp->bit_idx[1];
      int adr_idx = 3;

      long     off = off_idx? thr->words[off_idx].w_int  : 0;
      vvp_time64_t del = del_idx? thr->words[del_idx].w_uint : 0;
      long     adr = thr->words[adr_idx].w_int;

      vvp_vector4_t val = thr->pop_vec4();
      unsigned wid = val.size();
      const unsigned array_wid = cp->array->get_word_size();

	// Abort if flags[4] is set. This can happen if the calculation
	// into an index register failed.
      if (thr->flags[4] == BIT4_1)
	    return true;

      if (off >= (long)array_wid)
	    return true;
      if (off < 0) {
	    if ((unsigned)-off >= array_wid)
		  return true;

	    int use_off = -off;
	    assert(wid > (unsigned)use_off);
	    unsigned use_wid = wid - use_off;
	    val = val.subvalue(use_off, use_wid);
	    off = 0;
	    wid = use_wid;
      }
      if (off+wid > array_wid) {
	    val = val.subvalue(0, array_wid-off);
	    wid = val.size();
      }

      schedule_assign_array_word(cp->array, adr, off, val, del);

      return true;
}

/*
 * %assign/vec4/a/e <arr>, <offx>
 */
bool of_ASSIGN_VEC4_A_E(vthread_t thr, vvp_code_t cp)
{
      int off_idx = cp->bit_idx[0];
      int adr_idx = 3;

      long     off = off_idx? thr->words[off_idx].w_int  : 0;
      long     adr = thr->words[adr_idx].w_int;

      vvp_vector4_t val = thr->pop_vec4();
      unsigned wid = val.size();
      const unsigned array_wid = cp->array->get_word_size();

	// Abort if flags[4] is set. This can happen if the calculation
	// into an index register failed.
      if (thr->flags[4] == BIT4_1)
	    return true;

      if (off >= (long)array_wid)
	    return true;
      if (off < 0) {
	    if ((unsigned)-off >= array_wid)
		  return true;

	    int use_off = -off;
	    assert(wid > (unsigned)use_off);
	    unsigned use_wid = wid - use_off;
	    val = val.subvalue(use_off, use_wid);
	    off = 0;
	    wid = use_wid;
      }
      if (off+wid > array_wid) {
	    val = val.subvalue(0, array_wid-off);
	    wid = val.size();
      }

      if (thr->ecount == 0) {
	    schedule_assign_array_word(cp->array, adr, off, val, 0);
      } else {
	    schedule_evctl(cp->array, adr, val, off, thr->event, thr->ecount);
      }

      return true;
}

/*
 * %assign/vec4/off/d <var>, <off>, <del>
 */
bool of_ASSIGN_VEC4_OFF_D(vthread_t thr, vvp_code_t cp)
{
      vvp_net_ptr_t ptr (cp->net, 0);
      unsigned off_index = cp->bit_idx[0];
      unsigned del_index = cp->bit_idx[1];
      vvp_vector4_t val = thr->pop_vec4();
      unsigned wid = val.size();

      int off = thr->words[off_index].w_int;
      vvp_time64_t del = thr->words[del_index].w_uint;

	// Abort if flags[4] is set. This can happen if the calculation
	// into an index register failed.
      if (thr->flags[4] == BIT4_1)
	    return true;

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

      if (off >= (long)sig->value_size())
	    return true;
      if (off < 0) {
	    if ((unsigned)-off >= wid)
		  return true;

	    int use_off = -off;
	    assert(wid > (unsigned)use_off);
	    unsigned use_wid = wid - use_off;
	    val = val.subvalue(use_off, use_wid);
	    off = 0;
	    wid = use_wid;
      }
      if (off+wid > sig->value_size()) {
	    val = val.subvalue(0, sig->value_size()-off);
	    wid = val.size();
      }

      schedule_assign_vector(ptr, off, sig->value_size(), val, del);
      return true;
}

/*
 * %assign/vec4/off/e <var>, <off>
 */
bool of_ASSIGN_VEC4_OFF_E(vthread_t thr, vvp_code_t cp)
{
      vvp_net_ptr_t ptr (cp->net, 0);
      unsigned off_index = cp->bit_idx[0];
      vvp_vector4_t val = thr->pop_vec4();
      unsigned wid = val.size();

      int off = thr->words[off_index].w_int;

	// Abort if flags[4] is set. This can happen if the calculation
	// into an index register failed.
      if (thr->flags[4] == BIT4_1)
	    return true;

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

      if (off >= (long)sig->value_size())
	    return true;
      if (off < 0) {
	    if ((unsigned)-off >= wid)
		  return true;

	    int use_off = -off;
	    assert((int)wid > use_off);
	    unsigned use_wid = wid - use_off;
	    val = val.subvalue(use_off, use_wid);
	    off = 0;
	    wid = use_wid;
      }
      if (off+wid > sig->value_size()) {
	    val = val.subvalue(0, sig->value_size()-off);
	    wid = val.size();
      }

      if (thr->ecount == 0) {
	    schedule_assign_vector(ptr, off, sig->value_size(), val, 0);
      } else {
	    schedule_evctl(ptr, val, off, sig->value_size(), thr->event, thr->ecount);
      }

      return true;
}

/*
 * %assign/vec4/d <var-label> <delay>
 */
bool of_ASSIGN_VEC4D(vthread_t thr, vvp_code_t cp)
{
      vvp_net_ptr_t ptr (cp->net, 0);
      unsigned del_index = cp->bit_idx[0];
      vvp_time64_t del = thr->words[del_index].w_int;

      vvp_vector4_t value = thr->pop_vec4();

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

      schedule_assign_vector(ptr, 0, sig->value_size(), value, del);

      return true;
}

/*
 * %assign/vec4/e <var-label>
 */
bool of_ASSIGN_VEC4E(vthread_t thr, vvp_code_t cp)
{
      vvp_net_ptr_t ptr (cp->net, 0);
      vvp_vector4_t value = thr->pop_vec4();

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

      if (thr->ecount == 0) {
	    schedule_assign_vector(ptr, 0, sig->value_size(), value, 0);
      } else {
	    schedule_evctl(ptr, value, 0, sig->value_size(), thr->event, thr->ecount);
      }

      thr->event = 0;
      thr->ecount = 0;
      return true;
}

/*
 * This is %assign/wr <vpi-label>, <delay>
 *
 * This assigns (after a delay) a value to a real variable. Use the
 * vpi_put_value function to do the assign, with the delay written
 * into the vpiInertialDelay carrying the desired delay.
 */
bool of_ASSIGN_WR(vthread_t thr, vvp_code_t cp)
{
      unsigned delay = cp->bit_idx[0];
      double value = thr->pop_real();
      s_vpi_time del;

      del.type = vpiSimTime;
      vpip_time_to_timestruct(&del, delay);

      __vpiHandle*tmp = cp->handle;

      t_vpi_value val;
      val.format = vpiRealVal;
      val.value.real = value;
      vpi_put_value(tmp, &val, &del, vpiTransportDelay);

      return true;
}

bool of_ASSIGN_WRD(vthread_t thr, vvp_code_t cp)
{
      vvp_time64_t delay = thr->words[cp->bit_idx[0]].w_uint;
      double value = thr->pop_real();
      s_vpi_time del;

      del.type = vpiSimTime;
      vpip_time_to_timestruct(&del, delay);

      __vpiHandle*tmp = cp->handle;

      t_vpi_value val;
      val.format = vpiRealVal;
      val.value.real = value;
      vpi_put_value(tmp, &val, &del, vpiTransportDelay);

      return true;
}

bool of_ASSIGN_WRE(vthread_t thr, vvp_code_t cp)
{
      assert(thr->event != 0);
      double value = thr->pop_real();
      __vpiHandle*tmp = cp->handle;

	// If the count is zero then just put the value.
      if (thr->ecount == 0) {
	    t_vpi_value val;

	    val.format = vpiRealVal;
	    val.value.real = value;
	    vpi_put_value(tmp, &val, 0, vpiNoDelay);
      } else {
	    schedule_evctl(tmp, value, thr->event, thr->ecount);
      }

      thr->event = 0;
      thr->ecount = 0;

      return true;
}

bool of_BLEND(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t vala = thr->pop_vec4();
      vvp_vector4_t valb = thr->pop_vec4();
      assert(vala.size() == valb.size());

      for (unsigned idx = 0 ; idx < vala.size() ; idx += 1) {
	    if (vala.value(idx) == valb.value(idx))
		  continue;

	    vala.set_bit(idx, BIT4_X);
      }

      thr->push_vec4(vala);
      return true;
}

bool of_BLEND_WR(vthread_t thr, vvp_code_t)
{
      double f = thr->pop_real();
      double t = thr->pop_real();
      thr->push_real((t == f) ? t : 0.0);
      return true;
}

bool of_BREAKPOINT(vthread_t, vvp_code_t)
{
      return true;
}

/*
 * %callf/void <code-label>, <scope-label>
 * Combine the %fork and %join steps for invoking a function.
 */
static bool do_callf_void(vthread_t thr, vthread_t child)
{

      if (child->parent_scope->is_automatic()) {
	      /* The context allocated for this child is the top entry
		 on the write context stack */
	    child->wt_context = thr->wt_context;
	    child->rd_context = thr->wt_context;
      }

        // Mark the function thread as a direct child of the current thread.
      child->parent = thr;
      thr->children.insert(child);
        // This should be the only child
      assert(thr->children.size()==1);

        // Execute the function. This SHOULD run the function to completion,
        // but there are some exceptional situations where it won't.
      assert(child->parent_scope->get_type_code() == vpiFunction);
      thr->task_func_children.insert(child);
      child->is_scheduled = 1;
      child->i_am_in_function = 1;
      vthread_run(child);
      running_thread = thr;

      assert(test_joinable(thr, child));

      if (child->i_have_ended) {
	    do_join(thr, child);
	    return true;
      } else {
	    thr->i_am_joining = 1;
	    return false;
      }
}

bool of_CALLF_OBJ(vthread_t thr, vvp_code_t cp)
{
      vthread_t child = vthread_new(cp->cptr2, cp->scope);
      return do_callf_void(thr, child);

      // XXXX NOT IMPLEMENTED
}

bool of_CALLF_REAL(vthread_t thr, vvp_code_t cp)
{
      vthread_t child = vthread_new(cp->cptr2, cp->scope);

	// This is the return value. Push a place-holder value. The function
	// will replace this with the actual value using a %ret/real instruction.
      thr->push_real(0.0);
      child->args_real.push_back(0);

      return do_callf_void(thr, child);
}

bool of_CALLF_STR(vthread_t thr, vvp_code_t cp)
{
      vthread_t child = vthread_new(cp->cptr2, cp->scope);

      thr->push_str("");
      child->args_str.push_back(0);

      return do_callf_void(thr, child);
}

bool of_CALLF_VEC4(vthread_t thr, vvp_code_t cp)
{
      vthread_t child = vthread_new(cp->cptr2, cp->scope);

      vpiScopeFunction*scope_func = dynamic_cast<vpiScopeFunction*>(cp->scope);
      assert(scope_func);

	// This is the return value. Push a place-holder value. The function
	// will replace this with the actual value using a %ret/real instruction.
      thr->push_vec4(vvp_vector4_t(scope_func->get_func_width()));
      child->args_vec4.push_back(0);

      return do_callf_void(thr, child);
}

bool of_CALLF_VOID(vthread_t thr, vvp_code_t cp)
{
      vthread_t child = vthread_new(cp->cptr2, cp->scope);
      return do_callf_void(thr, child);
}

/*
 * The %cassign/link instruction connects a source node to a
 * destination node. The destination node must be a signal, as it is
 * marked with the source of the cassign so that it may later be
 * unlinked without specifically knowing the source that this
 * instruction used.
 */
bool of_CASSIGN_LINK(vthread_t, vvp_code_t cp)
{
      vvp_net_t*dst = cp->net;
      vvp_net_t*src = cp->net2;

      vvp_fun_signal_base*sig
	    = dynamic_cast<vvp_fun_signal_base*>(dst->fun);
      assert(sig);

	/* Any previous continuous assign should have been removed already. */
      assert(sig->cassign_link == 0);

      sig->cassign_link = src;

	/* Link the output of the src to the port[1] (the cassign
	   port) of the destination. */
      vvp_net_ptr_t dst_ptr (dst, 1);
      src->link(dst_ptr);

      return true;
}

/*
 * If there is an existing continuous assign linked to the destination
 * node, unlink it. This must be done before applying a new continuous
 * assign, otherwise the initial assigned value will be propagated to
 * any other nodes driven by the old continuous assign source.
 */
static void cassign_unlink(vvp_net_t*dst)
{
      vvp_fun_signal_base*sig
	    = dynamic_cast<vvp_fun_signal_base*>(dst->fun);
      assert(sig);

      if (sig->cassign_link == 0)
	    return;

      vvp_net_ptr_t tmp (dst, 1);
      sig->cassign_link->unlink(tmp);
      sig->cassign_link = 0;
}

/*
 * The %cassign/v instruction invokes a continuous assign of a
 * constant value to a signal. The instruction arguments are:
 *
 *     %cassign/vec4 <net>;
 *
 * Where the <net> is the net label assembled into a vvp_net pointer,
 * and the <base> and <wid> are stashed in the bit_idx array.
 *
 * This instruction writes vvp_vector4_t values to port-1 of the
 * target signal.
 */
bool of_CASSIGN_VEC4(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      vvp_vector4_t value = thr->pop_vec4();

	/* Remove any previous continuous assign to this net. */
      cassign_unlink(net);

	/* Set the value into port 1 of the destination. */
      vvp_net_ptr_t ptr (net, 1);
      vvp_send_vec4(ptr, value, 0);

      return true;
}

/*
 * %cassign/vec4/off <var>, <off>
 */
bool of_CASSIGN_VEC4_OFF(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      unsigned base_idx = cp->bit_idx[0];
      long base = thr->words[base_idx].w_int;
      vvp_vector4_t value = thr->pop_vec4();
      unsigned wid = value.size();

      if (thr->flags[4] == BIT4_1)
	    return true;

	/* Remove any previous continuous assign to this net. */
      cassign_unlink(net);

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (net->fil);
      assert(sig);

      if (base < 0 && (wid <= (unsigned)-base))
	    return true;

      if (base >= (long)sig->value_size())
	    return true;

      if (base < 0) {
	    wid -= (unsigned) -base;
	    base = 0;
	    value.resize(wid);
      }

      if (base+wid > sig->value_size()) {
	    wid = sig->value_size() - base;
	    value.resize(wid);
      }

      vvp_net_ptr_t ptr (net, 1);
      vvp_send_vec4_pv(ptr, value, base, wid, sig->value_size(), 0);
      return true;
}

bool of_CASSIGN_WR(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net  = cp->net;
      double value = thr->pop_real();

	/* Remove any previous continuous assign to this net. */
      cassign_unlink(net);

	/* Set the value into port 1 of the destination. */
      vvp_net_ptr_t ptr (net, 1);
      vvp_send_real(ptr, value, 0);

      return true;
}

/*
 * %cast2
 */
bool of_CAST2(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t&val = thr->peek_vec4();
      unsigned wid = val.size();

      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
	    switch (val.value(idx)) {
		case BIT4_0:
		case BIT4_1:
		  break;
		default:
		  val.set_bit(idx, BIT4_0);
		  break;
	    }
      }

      return true;
}

static void do_CMPE(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
{
      assert(rval.size() == lval.size());

      if (lval.has_xz() || rval.has_xz()) {

	    unsigned wid = lval.size();
	    vvp_bit4_t eq  = BIT4_1;
	    vvp_bit4_t eeq = BIT4_1;

	    for (unsigned idx = 0 ; idx < wid ; idx += 1) {
		  vvp_bit4_t lv = lval.value(idx);
		  vvp_bit4_t rv = rval.value(idx);

		  if (lv != rv)
			eeq = BIT4_0;

		  if (eq==BIT4_1 && (bit4_is_xz(lv) || bit4_is_xz(rv)))
			eq = BIT4_X;
		  if ((lv == BIT4_0) && (rv==BIT4_1))
			eq = BIT4_0;
		  if ((lv == BIT4_1) && (rv==BIT4_0))
			eq = BIT4_0;

		  if (eq == BIT4_0)
			break;
	    }

	    thr->flags[4] = eq;
	    thr->flags[6] = eeq;

      } else {
	      // If there are no XZ bits anywhere, then the results of
	      // == match the === test.
	    thr->flags[4] = thr->flags[6] = (lval.eeq(rval)? BIT4_1 : BIT4_0);
      }
}

/*
 *  %cmp/e
 *
 * Pop the operands from the stack, and do not replace them. The
 * results are written to flag bits:
 *
 *	4: eq  (equal)
 *
 *	6: eeq (case equal)
 */
bool of_CMPE(vthread_t thr, vvp_code_t)
{
	// We are going to pop these and push nothing in their
	// place, but for now it is more efficient to use a constant
	// reference. When we finish, pop the stack without copies.
      const vvp_vector4_t&rval = thr->peek_vec4(0);
      const vvp_vector4_t&lval = thr->peek_vec4(1);

      do_CMPE(thr, lval, rval);

      thr->pop_vec4(2);
      return true;
}

bool of_CMPNE(vthread_t thr, vvp_code_t)
{
	// We are going to pop these and push nothing in their
	// place, but for now it is more efficient to use a constant
	// reference. When we finish, pop the stack without copies.
      const vvp_vector4_t&rval = thr->peek_vec4(0);
      const vvp_vector4_t&lval = thr->peek_vec4(1);

      do_CMPE(thr, lval, rval);

      thr->flags[4] =  ~thr->flags[4];
      thr->flags[6] =  ~thr->flags[6];

      thr->pop_vec4(2);
      return true;
}

/*
 * %cmpi/e <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments.
 */
bool of_CMPIE(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&lval = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t rval (wid, BIT4_0);
      get_immediate_rval (cp, rval);

      do_CMPE(thr, lval, rval);

      thr->pop_vec4(1);
      return true;
}

bool of_CMPINE(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&lval = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t rval (wid, BIT4_0);
      get_immediate_rval (cp, rval);

      do_CMPE(thr, lval, rval);

      thr->flags[4] =  ~thr->flags[4];
      thr->flags[6] =  ~thr->flags[6];

      thr->pop_vec4(1);
      return true;
}


static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
{
      assert(rval.size() == lval.size());

	// If either value has XZ bits, then the eq and lt values are
	// known already to be X. Just calculate the eeq result as a
	// special case and short circuit the rest of the compare.
      if (lval.has_xz() || rval.has_xz()) {
	    thr->flags[4] = BIT4_X; // eq
	    thr->flags[5] = BIT4_X; // lt
	    thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
	    return;
      }

	// Past this point, we know we are dealing only with fully
	// defined values.
      unsigned wid = lval.size();

      const vvp_bit4_t sig1 = lval.value(wid-1);
      const vvp_bit4_t sig2 = rval.value(wid-1);

	// If the lval is <0 and the rval is >=0, then we know the result.
      if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) {
	    thr->flags[4] = BIT4_0; // eq;
	    thr->flags[5] = BIT4_1; // lt;
	    thr->flags[6] = BIT4_0; // eeq
	    return;
      }

	// If the lval is >=0 and the rval is <0, then we know the result.
      if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) {
	    thr->flags[4] = BIT4_0; // eq;
	    thr->flags[5] = BIT4_0; // lt;
	    thr->flags[6] = BIT4_0; // eeq
	    return;
      }

	// The values have the same sign, so we have to look at the
	// actual value. Scan from the MSB down. As soon as we find a
	// bit that differs, we know the result.

      for (unsigned idx = 1 ;  idx < wid ;  idx += 1) {
	    vvp_bit4_t lv = lval.value(wid-1-idx);
	    vvp_bit4_t rv = rval.value(wid-1-idx);

	    if (lv == rv)
		  continue;

	    thr->flags[4] = BIT4_0; // eq
	    thr->flags[6] = BIT4_0; // eeq

	    if (lv==BIT4_0) {
		  thr->flags[5] = BIT4_1; // lt
	    } else {
		  thr->flags[5] = BIT4_0; // lt
	    }
	    return;
      }

	// If we survive the loop above, then the values must be equal.
      thr->flags[4] = BIT4_1;
      thr->flags[5] = BIT4_0;
      thr->flags[6] = BIT4_1;
}

/*
 *  %cmp/s
 *
 * Pop the operands from the stack, and do not replace them. The
 * results are written to flag bits:
 *
 *	4: eq  (equal)
 *	5: lt  (less than)
 *	6: eeq (case equal)
 */
bool of_CMPS(vthread_t thr, vvp_code_t)
{
	// We are going to pop these and push nothing in their
	// place, but for now it is more efficient to use a constant
	// reference. When we finish, pop the stack without copies.
      const vvp_vector4_t&rval = thr->peek_vec4(0);
      const vvp_vector4_t&lval = thr->peek_vec4(1);

      do_CMPS(thr, lval, rval);

      thr->pop_vec4(2);
      return true;
}

/*
 * %cmpi/s <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments.
 */
bool of_CMPIS(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&lval = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t rval (wid, BIT4_0);
      get_immediate_rval (cp, rval);

      do_CMPS(thr, lval, rval);

      thr->pop_vec4(1);
      return true;
}

bool of_CMPSTR(vthread_t thr, vvp_code_t)
{
      string re = thr->pop_str();
      string le = thr->pop_str();

      int rc = strcmp(le.c_str(), re.c_str());

      vvp_bit4_t eq;
      vvp_bit4_t lt;

      if (rc == 0) {
	    eq = BIT4_1;
	    lt = BIT4_0;
      } else if (rc < 0) {
	    eq = BIT4_0;
	    lt = BIT4_1;
      } else {
	    eq = BIT4_0;
	    lt = BIT4_0;
      }

      thr->flags[4] = eq;
      thr->flags[5] = lt;

      return true;
}

static void of_CMPU_the_hard_way(vthread_t thr, unsigned wid,
				 const vvp_vector4_t&lval,
				 const vvp_vector4_t&rval)
{
      vvp_bit4_t eq = BIT4_1;
      vvp_bit4_t eeq = BIT4_1;

      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
	    vvp_bit4_t lv = lval.value(idx);
	    vvp_bit4_t rv = rval.value(idx);

	    if (lv != rv)
		  eeq = BIT4_0;

	    if (eq==BIT4_1 && (bit4_is_xz(lv) || bit4_is_xz(rv)))
		  eq = BIT4_X;
	    if ((lv == BIT4_0) && (rv==BIT4_1))
		  eq = BIT4_0;
	    if ((lv == BIT4_1) && (rv==BIT4_0))
		  eq = BIT4_0;

	    if (eq == BIT4_0)
		  break;

      }

      thr->flags[4] = eq;
      thr->flags[5] = BIT4_X;
      thr->flags[6] = eeq;
}

static void do_CMPU(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
{
      vvp_bit4_t eq = BIT4_1;
      vvp_bit4_t lt = BIT4_0;

      if (rval.size() != lval.size()) {
	    cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval
		 << ", rval=" << rval << endl;
      }
      assert(rval.size() == lval.size());
      unsigned wid = lval.size();

      unsigned long*larray = lval.subarray(0,wid);
      if (larray == 0) return of_CMPU_the_hard_way(thr, wid, lval, rval);

      unsigned long*rarray = rval.subarray(0,wid);
      if (rarray == 0) {
	    delete[]larray;
	    return of_CMPU_the_hard_way(thr, wid, lval, rval);
      }

      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;

      for (unsigned wdx = 0 ; wdx < words ; wdx += 1) {
	    if (larray[wdx] == rarray[wdx])
		  continue;

	    eq = BIT4_0;
	    if (larray[wdx] < rarray[wdx])
		  lt = BIT4_1;
	    else
		  lt = BIT4_0;
      }

      delete[]larray;
      delete[]rarray;

      thr->flags[4] = eq;
      thr->flags[5] = lt;
      thr->flags[6] = eq;
}

bool of_CMPU(vthread_t thr, vvp_code_t)
{

      const vvp_vector4_t&rval = thr->peek_vec4(0);
      const vvp_vector4_t&lval = thr->peek_vec4(1);

      do_CMPU(thr, lval, rval);

      thr->pop_vec4(2);
      return true;
}

/*
 * %cmpi/u <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments.
 */
bool of_CMPIU(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&lval = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t rval (wid, BIT4_0);
      get_immediate_rval (cp, rval);

      do_CMPU(thr, lval, rval);

      thr->pop_vec4(1);
      return true;
}


/*
 * %cmp/x
 */
bool of_CMPX(vthread_t thr, vvp_code_t)
{
      vvp_bit4_t eq = BIT4_1;
      vvp_vector4_t rval = thr->pop_vec4();
      vvp_vector4_t lval = thr->pop_vec4();

      assert(rval.size() == lval.size());
      unsigned wid = lval.size();

      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
	    vvp_bit4_t lv = lval.value(idx);
	    vvp_bit4_t rv = rval.value(idx);
	    if ((lv != rv) && !bit4_is_xz(lv) && !bit4_is_xz(rv)) {
		  eq = BIT4_0;
		  break;
	    }
      }

      thr->flags[4] = eq;
      return true;
}

bool of_CMPWR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();

      vvp_bit4_t eq = (l == r)? BIT4_1 : BIT4_0;
      vvp_bit4_t lt = (l <  r)? BIT4_1 : BIT4_0;

      thr->flags[4] = eq;
      thr->flags[5] = lt;

      return true;
}

bool of_CMPWS(vthread_t thr, vvp_code_t cp)
{
      int64_t l = thr->words[cp->bit_idx[0]].w_int;
      int64_t r = thr->words[cp->bit_idx[1]].w_int;

      vvp_bit4_t eq = (l == r)? BIT4_1 : BIT4_0;
      vvp_bit4_t lt = (l <  r)? BIT4_1 : BIT4_0;

      thr->flags[4] = eq;
      thr->flags[5] = lt;

      return true;
}

bool of_CMPWU(vthread_t thr, vvp_code_t cp)
{
      uint64_t l = thr->words[cp->bit_idx[0]].w_uint;
      uint64_t r = thr->words[cp->bit_idx[1]].w_uint;

      vvp_bit4_t eq = (l == r)? BIT4_1 : BIT4_0;
      vvp_bit4_t lt = (l <  r)? BIT4_1 : BIT4_0;

      thr->flags[4] = eq;
      thr->flags[5] = lt;

      return true;
}

/*
 * %cmp/z
 */
bool of_CMPZ(vthread_t thr, vvp_code_t)
{
      vvp_bit4_t eq = BIT4_1;
      vvp_vector4_t rval = thr->pop_vec4();
      vvp_vector4_t lval = thr->pop_vec4();

      assert(rval.size() == lval.size());
      unsigned wid = lval.size();

      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
	    vvp_bit4_t lv = lval.value(idx);
	    vvp_bit4_t rv = rval.value(idx);
	    if ((lv != rv) && (rv != BIT4_Z) && (lv != BIT4_Z)) {
		  eq = BIT4_0;
		  break;
	    }
      }

      thr->flags[4] = eq;
      return true;
}

/*
 *  %concat/str;
 */
bool of_CONCAT_STR(vthread_t thr, vvp_code_t)
{
      string text = thr->pop_str();
      thr->peek_str(0).append(text);
      return true;
}

/*
 *  %concati/str <string>;
 */
bool of_CONCATI_STR(vthread_t thr, vvp_code_t cp)
{
      const char*text = cp->text;
      thr->peek_str(0).append(text);
      return true;
}

/*
 * %concat/vec4
 */
bool of_CONCAT_VEC4(vthread_t thr, vvp_code_t)
{
      const vvp_vector4_t&lsb = thr->peek_vec4(0);
      const vvp_vector4_t&msb = thr->peek_vec4(1);

	// The result is the size of the top two vectors in the stack.
      vvp_vector4_t res (msb.size() + lsb.size(), BIT4_X);

	// Build up the result.
      res.set_vec(0, lsb);
      res.set_vec(lsb.size(), msb);

	// Rearrange the stack to pop the inputs and push the
	// result. Do that by actually popping only 1 stack position
	// and replacing the new top with the new value.
      thr->pop_vec4(1);
      thr->peek_vec4() = res;

      return true;
}

/*
 * %concati/vec4 <vala>, <valb>, <wid>
 *
 * Concat the immediate value to the LOW bits of the concatenation.
 * Get the HIGH bits from the top of the vec4 stack.
 */
bool of_CONCATI_VEC4(vthread_t thr, vvp_code_t cp)
{
      uint32_t vala = cp->bit_idx[0];
      uint32_t valb = cp->bit_idx[1];
      unsigned wid  = cp->number;

      vvp_vector4_t&msb = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t lsb (wid, BIT4_0);

	// The %concati/vec4 can create values bigger then 32 bits, but
	// only if the high bits are zero. So at most we need to run
	// through the loop below 32 times. Maybe less, if the target
	// width is less. We don't have to do anything special on that
	// because vala/valb bits will shift away so (vala|valb) will
	// turn to zero at or before 32 shifts.

      for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
	    uint32_t ba = 0;
	      // Convert the vala/valb bits to a ba number that can be
	      // used to select what goes into the value.
	    ba = (valb & 1) << 1;
	    ba |= vala & 1;

	    switch (ba) {
		case 1:
		  lsb.set_bit(idx, BIT4_1);
		  break;
		case 2:
		  lsb.set_bit(idx, BIT4_Z);
		  break;
		case 3:
		  lsb.set_bit(idx, BIT4_X);
		  break;
		default:
		  break;
	    }

	    vala >>= 1;
	    valb >>= 1;
      }

      vvp_vector4_t res (msb.size()+lsb.size(), BIT4_X);
      res.set_vec(0, lsb);
      res.set_vec(lsb.size(), msb);

      msb = res;
      return true;
}

/*
 * %cvt/rv
 */
bool of_CVT_RV(vthread_t thr, vvp_code_t)
{
      double val;
      vvp_vector4_t val4 = thr->pop_vec4();
      vector4_to_value(val4, val, false);
      thr->push_real(val);
      return true;
}

/*
 * %cvt/rv/s
 */
bool of_CVT_RV_S(vthread_t thr, vvp_code_t)
{
      double val;
      vvp_vector4_t val4 = thr->pop_vec4();
      vector4_to_value(val4, val, true);
      thr->push_real(val);
      return true;
}

/*
 * %cvt/sr <idx>
 * Pop the top value from the real stack, convert it to a 64bit signed
 * and save it to the indexed register.
 */
bool of_CVT_SR(vthread_t thr, vvp_code_t cp)
{
      double r = thr->pop_real();
      thr->words[cp->bit_idx[0]].w_int = i64round(r);

      return true;
}

/*
 * %cvt/ur <idx>
 */
bool of_CVT_UR(vthread_t thr, vvp_code_t cp)
{
      double r = thr->pop_real();
      if (r >= 0.0)
	    thr->words[cp->bit_idx[0]].w_uint = (uint64_t)floor(r+0.5);
      else
	    thr->words[cp->bit_idx[0]].w_uint = (uint64_t)ceil(r-0.5);

      return true;
}

/*
 * %cvt/vr <wid>
 */
bool of_CVT_VR(vthread_t thr, vvp_code_t cp)
{
      double r = thr->pop_real();
      unsigned wid = cp->number;

      vvp_vector4_t tmp(wid, r);
      thr->push_vec4(tmp);
      return true;
}

/*
 * This implements the %deassign instruction. All we do is write a
 * long(1) to port-3 of the addressed net. This turns off an active
 * continuous assign activated by %cassign/v
 */
bool of_DEASSIGN(vthread_t, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      unsigned base  = cp->bit_idx[0];
      unsigned width = cp->bit_idx[1];

      vvp_signal_value*fil = dynamic_cast<vvp_signal_value*> (net->fil);
      assert(fil);
      vvp_fun_signal_vec*sig = dynamic_cast<vvp_fun_signal_vec*>(net->fun);
      assert(sig);

      if (base >= fil->value_size()) return true;
      if (base+width > fil->value_size()) width = fil->value_size() - base;

      bool full_sig = base == 0 && width == fil->value_size();

	// This is the net that is forcing me...
      if (vvp_net_t*src = sig->cassign_link) {
	    if (! full_sig) {
		  fprintf(stderr, "Sorry: when a signal is assigning a "
		          "register, I cannot deassign part of it.\n");
		  exit(1);
	    }
	      // And this is the pointer to be removed.
	    vvp_net_ptr_t dst_ptr (net, 1);
	    src->unlink(dst_ptr);
	    sig->cassign_link = 0;
      }

	/* Do we release all or part of the net? */
      if (full_sig) {
	    sig->deassign();
      } else {
	    sig->deassign_pv(base, width);
      }

      return true;
}

bool of_DEASSIGN_WR(vthread_t, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      vvp_fun_signal_real*sig = dynamic_cast<vvp_fun_signal_real*>(net->fun);
      assert(sig);

	// This is the net that is forcing me...
      if (vvp_net_t*src = sig->cassign_link) {
	      // And this is the pointer to be removed.
	    vvp_net_ptr_t dst_ptr (net, 1);
	    src->unlink(dst_ptr);
	    sig->cassign_link = 0;
      }

      sig->deassign();

      return true;
}

/*
 * %debug/thr
 */
bool of_DEBUG_THR(vthread_t thr, vvp_code_t cp)
{
      const char*text = cp->text;
      thr->debug_dump(cerr, text);
      return true;
}

/*
 * The delay takes two 32bit numbers to make up a 64bit time.
 *
 *   %delay <low>, <hig>
 */
bool of_DELAY(vthread_t thr, vvp_code_t cp)
{
      vvp_time64_t low = cp->bit_idx[0];
      vvp_time64_t hig = cp->bit_idx[1];

      vvp_time64_t res = 32;
      res = hig << res;
      res += low;

      schedule_vthread(thr, res);
      return false;
}

bool of_DELAYX(vthread_t thr, vvp_code_t cp)
{
      vvp_time64_t delay;

      assert(cp->number < vthread_s::WORDS_COUNT);
      delay = thr->words[cp->number].w_uint;
      schedule_vthread(thr, delay);
      return false;
}

/* %delete/obj <label>
 *
 * This operator works by assigning a nil to the target object. This
 * causes any value that might be there to be garbage collected, thus
 * deleting the object.
 */
bool of_DELETE_OBJ(vthread_t thr, vvp_code_t cp)
{
	/* set the value into port 0 of the destination. */
      vvp_net_ptr_t ptr (cp->net, 0);
      vvp_send_object(ptr, vvp_object_t(), thr->wt_context);

      return true;
}

static bool do_disable(vthread_t thr, vthread_t match)
{
      bool flag = false;

	/* Pull the target thread out of its scope if needed. */
      thr->parent_scope->threads.erase(thr);

	/* Turn the thread off by setting is program counter to
	   zero and setting an OFF bit. */
      thr->pc = codespace_null();
      thr->i_was_disabled = 1;
      thr->i_have_ended = 1;

	/* Turn off all the children of the thread. Simulate a %join
	   for as many times as needed to clear the results of all the
	   %forks that this thread has done. */
      while (! thr->children.empty()) {

	    vthread_t tmp = *(thr->children.begin());
	    assert(tmp);
	    assert(tmp->parent == thr);
	    thr->i_am_joining = 0;
	    if (do_disable(tmp, match))
		  flag = true;

	    vthread_reap(tmp);
      }

      vthread_t parent = thr->parent;
      if (parent && parent->i_am_joining) {
	      // If a parent is waiting in a %join, wake it up. Note
	      // that it is possible to be waiting in a %join yet
	      // already scheduled if multiple child threads are
	      // ending. So check if the thread is already scheduled
	      // before scheduling it again.
	    parent->i_am_joining = 0;
	    if (! parent->i_have_ended)
		  schedule_vthread(parent, 0, true);

	      // Let the parent do the reaping.
	    vthread_reap(thr);

      } else if (parent) {
	      /* If the parent is yet to %join me, let its %join
		 do the reaping. */
	      //assert(tmp->is_scheduled == 0);

      } else {
	      /* No parent at all. Goodbye. */
	    vthread_reap(thr);
      }

      return flag || (thr == match);
}

/*
 * Implement the %disable instruction by scanning the target scope for
 * all the target threads. Kill the target threads and wake up a
 * parent that is attempting a %join.
 */
bool of_DISABLE(vthread_t thr, vvp_code_t cp)
{
      __vpiScope*scope = (__vpiScope*)cp->handle;

      bool disabled_myself_flag = false;

      while (! scope->threads.empty()) {
	    set<vthread_t>::iterator cur = scope->threads.begin();

	    if (do_disable(*cur, thr))
		  disabled_myself_flag = true;
      }

      return ! disabled_myself_flag;
}

/*
 * Implement the %disable/fork (SystemVerilog) instruction by disabling
 * all the detached children of the given thread.
 */
bool of_DISABLE_FORK(vthread_t thr, vvp_code_t)
{
	/* If a %disable/fork is being executed then the parent thread
	 * cannot be waiting in a join. */
      assert(! thr->i_am_joining);

	/* There should be no active children to disable. */
      assert(thr->children.empty());

	/* Disable any detached children. */
      while (! thr->detached_children.empty()) {
	    vthread_t child = *(thr->detached_children.begin());
	    assert(child);
	    assert(child->parent == thr);
	      /* Disabling the children can never match the parent thread. */
	    bool res = do_disable(child, thr);
	    assert(! res);
	    vthread_reap(child);
      }

      return true;
}

/*
 * This function divides a 2-word number {high, a} by a 1-word
 * number. Assume that high < b.
 */
static unsigned long divide2words(unsigned long a, unsigned long b,
				  unsigned long high)
{
      unsigned long result = 0;
      while (high > 0) {
	    unsigned long tmp_result = ULONG_MAX / b;
	    unsigned long remain = ULONG_MAX % b;

	    remain += 1;
	    if (remain >= b) {
		  remain -= b;
		  tmp_result += 1;
	    }

	      // Now 0x1_0...0 = b*tmp_result + remain
	      // high*0x1_0...0 = high*(b*tmp_result + remain)
	      // high*0x1_0...0 = high*b*tmp_result + high*remain

	      // We know that high*0x1_0...0 >= high*b*tmp_result, and
	      // we know that high*0x1_0...0 > high*remain. Use
	      // high*remain as the remainder for another iteration,
	      // and add tmp_result*high into the current estimate of
	      // the result.
	    result += tmp_result * high;

	      // The new iteration starts with high*remain + a.
	    remain = multiply_with_carry(high, remain, high);
	    a += remain;
            if(a < remain)
              high += 1;

	      // Now result*b + {high,a} == the input {high,a}. It is
	      // possible that the new high >= 1. If so, it will
	      // certainly be less than high from the previous
	      // iteration. Do another iteration and it will shrink,
	      // eventually to 0.
      }

	// high is now 0, so a is the remaining remainder, so we can
	// finish off the integer divide with a simple a/b.

      return result + a/b;
}

static unsigned long* divide_bits(unsigned long*ap, unsigned long*bp, unsigned wid)
{
	// Do all our work a cpu-word at a time. The "words" variable
	// is the number of words of the wid.
      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;

      unsigned btop = words-1;
      while (btop > 0 && bp[btop] == 0)
	    btop -= 1;

	// Detect divide by 0, and exit.
      if (btop==0 && bp[0]==0)
	    return 0;

	// The result array will eventually accumulate the result. The
	// diff array is a difference that we use in the intermediate.
      unsigned long*diff  = new unsigned long[words];
      unsigned long*result= new unsigned long[words];
      for (unsigned idx = 0 ; idx < words ; idx += 1)
	    result[idx] = 0;

      for (unsigned cur = words-btop ; cur > 0 ; cur -= 1) {
	    unsigned cur_ptr = cur-1;
	    unsigned long cur_res;
	    if (ap[cur_ptr+btop] >= bp[btop]) {
		  unsigned long high = 0;
		  if (cur_ptr+btop+1 < words)
			high = ap[cur_ptr+btop+1];
		  cur_res = divide2words(ap[cur_ptr+btop], bp[btop], high);

	    } else if (cur_ptr+btop+1 >= words) {
		  continue;

	    } else if (ap[cur_ptr+btop+1] == 0) {
		  continue;

	    } else {
		  cur_res = divide2words(ap[cur_ptr+btop], bp[btop],
					 ap[cur_ptr+btop+1]);
	    }

	      // cur_res is a guesstimate of the result this far. It
	      // may be 1 too big. (But it will also be >0) Try it,
	      // and if the difference comes out negative, then adjust.

	      // diff = (bp * cur_res)  << cur_ptr;
	    multiply_array_imm(diff+cur_ptr, bp, words-cur_ptr, cur_res);
	      // ap -= diff
	    unsigned long carry = 1;
	    for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
		  ap[idx] = add_with_carry(ap[idx], ~diff[idx], carry);

	      // ap has the diff subtracted out of it. If cur_res was
	      // too large, then ap will turn negative. (We easily
	      // tell that ap turned negative by looking at
	      // carry&1. If it is 0, then it is *negative*.) In that
	      // case, we know that cur_res was too large by 1. Correct by
	      // adding 1b back in and reducing cur_res.
	    if ((carry&1) == 0) {
		    // Keep adding b back in until the remainder
		    // becomes positive again.
		  do {
			cur_res -= 1;
			carry = 0;
			for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
			      ap[idx] = add_with_carry(ap[idx], bp[idx-cur_ptr], carry);
		  } while (carry == 0);
	    }

	    result[cur_ptr] = cur_res;
      }

	// Now ap contains the remainder and result contains the
	// desired result. We should find that:
	//  input-a = bp * result + ap;

      delete[]diff;
      return result;
}

/*
 * %div
 */
bool of_DIV(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valb = thr->pop_vec4();
      vvp_vector4_t vala = thr->pop_vec4();

      assert(vala.size()== valb.size());
      unsigned wid = vala.size();

      unsigned long*ap = vala.subarray(0, wid);
      if (ap == 0) {
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->push_vec4(tmp);
	    return true;
      }

      unsigned long*bp = valb.subarray(0, wid);
      if (bp == 0) {
	    delete[]ap;
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->push_vec4(tmp);
	    return true;
      }

	// If the value fits in a single CPU word, then do it the easy way.
      if (wid <= CPU_WORD_BITS) {
	    if (bp[0] == 0) {
		  vvp_vector4_t tmp(wid, BIT4_X);
		  thr->push_vec4(tmp);
	    } else {
		  ap[0] /= bp[0];
		  vala.setarray(0, wid, ap);
		  thr->push_vec4(vala);
	    }
	    delete[]ap;
	    delete[]bp;
	    return true;
      }

      unsigned long*result = divide_bits(ap, bp, wid);
      if (result == 0) {
	    delete[]ap;
	    delete[]bp;
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->push_vec4(tmp);
	    return true;
      }

	// Now ap contains the remainder and result contains the
	// desired result. We should find that:
	//  input-a = bp * result + ap;

      vala.setarray(0, wid, result);
      thr->push_vec4(vala);
      delete[]ap;
      delete[]bp;
      delete[]result;

      return true;
}


static void negate_words(unsigned long*val, unsigned words)
{
      unsigned long carry = 1;
      for (unsigned idx = 0 ; idx < words ; idx += 1)
	    val[idx] = add_with_carry(0, ~val[idx], carry);
}

/*
 * %div/s
 */
bool of_DIV_S(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valb = thr->pop_vec4();
      vvp_vector4_t&vala = thr->peek_vec4();

      assert(vala.size()== valb.size());
      unsigned wid = vala.size();
      unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;

	// Get the values, left in right, in binary form. If there is
	// a problem with either (caused by an X or Z bit) then we
	// know right away that the entire result is X.
      unsigned long*ap = vala.subarray(0, wid);
      if (ap == 0) {
	    vvp_vector4_t tmp(wid, BIT4_X);
	    vala = tmp;
	    return true;
      }

      unsigned long*bp = valb.subarray(0, wid);
      if (bp == 0) {
	    delete[]ap;
	    vvp_vector4_t tmp(wid, BIT4_X);
	    vala = tmp;
	    return true;
      }

	// Sign extend the bits in the array to fill out the array.
      unsigned long sign_mask = 0;
      if (unsigned long sign_bits = (words*CPU_WORD_BITS) - wid) {
	    sign_mask = -1UL << (CPU_WORD_BITS-sign_bits);
	    if (ap[words-1] & (sign_mask>>1))
		  ap[words-1] |= sign_mask;
	    if (bp[words-1] & (sign_mask>>1))
		  bp[words-1] |= sign_mask;
      }

	// If the value fits in a single word, then use the native divide.
      if (wid <= CPU_WORD_BITS) {
	    if (bp[0] == 0) {
		  vvp_vector4_t tmp(wid, BIT4_X);
		  vala = tmp;
	    } else {
		  long tmpa = (long) ap[0];
		  long tmpb = (long) bp[0];
		  long res = tmpa / tmpb;
		  ap[0] = ((unsigned long)res) & ~sign_mask;
		  vala.setarray(0, wid, ap);
	    }
	    delete[]ap;
	    delete[]bp;
	    return true;
      }

	// We need to the actual division to positive integers. Make
	// them positive here, and remember the negations.
      bool negate_flag = false;
      if ( ((long) ap[words-1]) < 0 ) {
	    negate_flag = true;
	    negate_words(ap, words);
      }
      if ( ((long) bp[words-1]) < 0 ) {
	    negate_flag ^= true;
	    negate_words(bp, words);
      }

      unsigned long*result = divide_bits(ap, bp, wid);
      if (result == 0) {
	    delete[]ap;
	    delete[]bp;
	    vvp_vector4_t tmp(wid, BIT4_X);
	    vala = tmp;
	    return true;
      }

      if (negate_flag) {
	    negate_words(result, words);
      }

      result[words-1] &= ~sign_mask;

      vala.setarray(0, wid, result);
      delete[]ap;
      delete[]bp;
      delete[]result;
      return true;
}

bool of_DIV_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      thr->push_real(l / r);

      return true;
}

bool of_DUP_REAL(vthread_t thr, vvp_code_t)
{
      thr->push_real(thr->peek_real(0));
      return true;
}

bool of_DUP_VEC4(vthread_t thr, vvp_code_t)
{
      thr->push_vec4(thr->peek_vec4(0));
      return true;
}

/*
 * This terminates the current thread. If there is a parent who is
 * waiting for me to die, then I schedule it. At any rate, I mark
 * myself as a zombie by setting my pc to 0.
 */
bool of_END(vthread_t thr, vvp_code_t)
{
      assert(! thr->waiting_for_event);
      thr->i_have_ended = 1;
      thr->pc = codespace_null();

	/* Fully detach any detached children. */
      while (! thr->detached_children.empty()) {
	    vthread_t child = *(thr->detached_children.begin());
	    assert(child);
	    assert(child->parent == thr);
	    assert(child->i_am_detached);
	    child->parent = 0;
	    child->i_am_detached = 0;
	    thr->detached_children.erase(thr->detached_children.begin());
      }

	/* It is an error to still have active children running at this
	 * point in time. They should have all been detached or joined. */
      assert(thr->children.empty());

	/* If I have a parent who is waiting for me, then mark that I
	   have ended, and schedule that parent. Also, finish the
	   %join for the parent. */
      if (thr->parent && thr->parent->i_am_joining) {
	    vthread_t tmp = thr->parent;
	    assert(! thr->i_am_detached);

	      // Detect that the parent is waiting on a task or function
	      // thread. These threads must be reaped first. If the
	      // parent is waiting on a task or function (other than me)
	      // then go into zombie state to be picked up later.
	    if (! test_joinable(tmp, thr))
		  return false;

	    tmp->i_am_joining = 0;
	    schedule_vthread(tmp, 0, true);
	    do_join(tmp, thr);
	    return false;
      }

	/* If this thread is not fully detached then remove it from the
	 * parents detached_children set and reap it. */
      if (thr->i_am_detached) {
	    vthread_t tmp = thr->parent;
	    assert(tmp);
	    size_t res = tmp->detached_children.erase(thr);
	    assert(res == 1);
	      /* If the parent is waiting for the detached children to
	       * finish then the last detached child needs to tell the
	       * parent to wake up when it is finished. */
	    if (tmp->i_am_waiting && tmp->detached_children.empty()) {
		  tmp->i_am_waiting = 0;
		  schedule_vthread(tmp, 0, true);
	    }
	      /* Fully detach this thread so it will be reaped below. */
	    thr->i_am_detached = 0;
	    thr->parent = 0;
      }

	/* If I have no parent, then no one can %join me and there is
	 * no reason to stick around. This can happen, for example if
	 * I am an ``initial'' thread. */
      if (thr->parent == 0) {
	    vthread_reap(thr);
	    return false;
      }

	/* If I make it this far, then I have a parent who may wish
	   to %join me. Remain a zombie so that it can. */

      return false;
}

/*
 * %event <var-label>
 */
bool of_EVENT(vthread_t thr, vvp_code_t cp)
{
      vvp_net_ptr_t ptr (cp->net, 0);
      vvp_vector4_t tmp (1, BIT4_X);
      vvp_send_vec4(ptr, tmp, thr->wt_context);
      return true;
}

bool of_EVCTL(vthread_t thr, vvp_code_t cp)
{
      assert(thr->event == 0 && thr->ecount == 0);
      thr->event = cp->net;
      thr->ecount = thr->words[cp->bit_idx[0]].w_uint;
      return true;
}
bool of_EVCTLC(vthread_t thr, vvp_code_t)
{
      thr->event = 0;
      thr->ecount = 0;
      return true;
}

bool of_EVCTLI(vthread_t thr, vvp_code_t cp)
{
      assert(thr->event == 0 && thr->ecount == 0);
      thr->event = cp->net;
      thr->ecount = cp->bit_idx[0];
      return true;
}

bool of_EVCTLS(vthread_t thr, vvp_code_t cp)
{
      assert(thr->event == 0 && thr->ecount == 0);
      thr->event = cp->net;
      int64_t val = thr->words[cp->bit_idx[0]].w_int;
      if (val < 0) val = 0;
      thr->ecount = val;
      return true;
}

bool of_FLAG_GET_VEC4(vthread_t thr, vvp_code_t cp)
{
      int flag = cp->number;
      assert(flag < vthread_s::FLAGS_COUNT);

      vvp_vector4_t val (1, thr->flags[flag]);
      thr->push_vec4(val);

      return true;
}

/*
 * %flag_inv <flag1>
 */
bool of_FLAG_INV(vthread_t thr, vvp_code_t cp)
{
      int flag1 = cp->bit_idx[0];

      thr->flags[flag1] = ~ thr->flags[flag1];
      return true;
}

/*
 * %flag_mov <flag1>, <flag2>
 */
bool of_FLAG_MOV(vthread_t thr, vvp_code_t cp)
{
      int flag1 = cp->bit_idx[0];
      int flag2 = cp->bit_idx[1];

      thr->flags[flag1] = thr->flags[flag2];
      return true;
}

/*
 * %flag_or <flag1>, <flag2>
 */
bool of_FLAG_OR(vthread_t thr, vvp_code_t cp)
{
      int flag1 = cp->bit_idx[0];
      int flag2 = cp->bit_idx[1];

      thr->flags[flag1] = thr->flags[flag1] | thr->flags[flag2];
      return true;
}

bool of_FLAG_SET_IMM(vthread_t thr, vvp_code_t cp)
{
      int flag = cp->number;
      int vali = cp->bit_idx[0];

      assert(flag < vthread_s::FLAGS_COUNT);
      assert(vali >= 0 && vali < 4);

      static vvp_bit4_t map_bit[4] = {BIT4_0, BIT4_1, BIT4_Z, BIT4_X};
      thr->flags[flag] = map_bit[vali];
      return true;
}

bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp)
{
      int flag = cp->number;
      assert(flag < vthread_s::FLAGS_COUNT);

      const vvp_vector4_t&val = thr->peek_vec4();
      thr->flags[flag] = val.value(0);
      thr->pop_vec4(1);

      return true;
}

/*
 * the %force/link instruction connects a source node to a
 * destination node. The destination node must be a signal, as it is
 * marked with the source of the force so that it may later be
 * unlinked without specifically knowing the source that this
 * instruction used.
 */
bool of_FORCE_LINK(vthread_t, vvp_code_t cp)
{
      vvp_net_t*dst = cp->net;
      vvp_net_t*src = cp->net2;

      assert(dst->fil);
      dst->fil->force_link(dst, src);

      return true;
}

/*
 * The %force/vec4 instruction invokes a force assign of a constant value
 * to a signal. The instruction arguments are:
 *
 *     %force/vec4 <net> ;
 *
 * where the <net> is the net label assembled into a vvp_net pointer,
 * and the value to be forced is popped from the vec4 stack.\.
 *
 * The instruction writes a vvp_vector4_t value to port-2 of the
 * target signal.
 */
bool of_FORCE_VEC4(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      vvp_vector4_t value = thr->pop_vec4();

	/* Send the force value to the filter on the node. */

      assert(net->fil);
      if (value.size() != net->fil->filter_size())
	    value = coerce_to_width(value, net->fil->filter_size());

      net->force_vec4(value, vvp_vector2_t(vvp_vector2_t::FILL1, net->fil->filter_size()));

      return true;
}

/*
 * %force/vec4/off <net>, <off>
 */
bool of_FORCE_VEC4_OFF(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      unsigned base_idx = cp->bit_idx[0];
      long base = thr->words[base_idx].w_int;
      vvp_vector4_t value = thr->pop_vec4();
      unsigned wid = value.size();

      assert(net->fil);

      if (thr->flags[4] == BIT4_1)
	    return true;

	// This is the width of the target vector.
      unsigned use_size = net->fil->filter_size();

      if (base >= (long)use_size)
	    return true;
      if (base < -(long)use_size)
	    return true;

      if ((base + wid) > use_size)
	    wid = use_size - base;

	// Make a mask of which bits are to be forced, 0 for unforced
	// bits and 1 for forced bits.
      vvp_vector2_t mask (vvp_vector2_t::FILL0, use_size);
      for (unsigned idx = 0 ; idx < wid ; idx += 1)
	    mask.set_bit(base+idx, 1);

      vvp_vector4_t tmp (use_size, BIT4_Z);

	// vvp_net_t::force_vec4 propagates all the bits of the
	// forced vector value, regardless of the mask. This
	// ensures the unforced bits retain their current value.
      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*>(net->fil);
      assert(sig);
      sig->vec4_value(tmp);

      tmp.set_vec(base, value);

      net->force_vec4(tmp, mask);
      return true;
}

bool of_FORCE_WR(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net  = cp->net;
      double value = thr->pop_real();

      net->force_real(value, vvp_vector2_t(vvp_vector2_t::FILL1, 1));

      return true;
}

/*
 * The %fork instruction causes a new child to be created and pushed
 * in front of any existing child. This causes the new child to be
 * added to the list of children, and for me to be the parent of the
 * new child.
 */
bool of_FORK(vthread_t thr, vvp_code_t cp)
{
      vthread_t child = vthread_new(cp->cptr2, cp->scope);

      if (cp->scope->is_automatic()) {
              /* The context allocated for this child is the top entry
                 on the write context stack. */
            child->wt_context = thr->wt_context;
            child->rd_context = thr->wt_context;
      }

      child->parent = thr;
      thr->children.insert(child);

      switch (cp->scope->get_type_code()) {
          case vpiFunction:
	      // Functions should be started by the %callf opcodes, and
	      // NOT by the %fork instruction
	    assert(0);
          case vpiTask:
	    thr->task_func_children.insert(child);
	    break;
          default:
	    break;
      }

      if (thr->i_am_in_function) {
	    child->is_scheduled = 1;
	    child->i_am_in_function = 1;
	    vthread_run(child);
	    running_thread = thr;
      } else {
	    schedule_vthread(child, 0, true);
      }
      return true;
}

bool of_FREE(vthread_t thr, vvp_code_t cp)
{
        /* Pop the child context from the read context stack. */
      vvp_context_t child_context = thr->rd_context;
      thr->rd_context = vvp_get_stacked_context(child_context);

        /* Free the context. */
      vthread_free_context(child_context, cp->scope);

      return true;
}

/*
 * %inv
 *
 * Logically, this pops a value, inverts is (Verilog style, with Z and
 * X converted to X) and pushes the result. We can more efficiently
 * just to the invert in place.
 */
bool of_INV(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t&val = thr->peek_vec4();
      val.invert();
      return true;
}


/*
 * Index registers, arithmetic.
 */

static inline int64_t get_as_64_bit(uint32_t low_32, uint32_t high_32)
{
      int64_t low = low_32;
      int64_t res = high_32;

      res <<= 32;
      res |= low;
      return res;
}

bool of_IX_ADD(vthread_t thr, vvp_code_t cp)
{
      thr->words[cp->number].w_int += get_as_64_bit(cp->bit_idx[0],
                                                    cp->bit_idx[1]);
      return true;
}

bool of_IX_SUB(vthread_t thr, vvp_code_t cp)
{
      thr->words[cp->number].w_int -= get_as_64_bit(cp->bit_idx[0],
                                                    cp->bit_idx[1]);
      return true;
}

bool of_IX_MUL(vthread_t thr, vvp_code_t cp)
{
      thr->words[cp->number].w_int *= get_as_64_bit(cp->bit_idx[0],
                                                    cp->bit_idx[1]);
      return true;
}

bool of_IX_LOAD(vthread_t thr, vvp_code_t cp)
{
      thr->words[cp->number].w_int = get_as_64_bit(cp->bit_idx[0],
                                                   cp->bit_idx[1]);
      return true;
}

bool of_IX_MOV(vthread_t thr, vvp_code_t cp)
{
      thr->words[cp->bit_idx[0]].w_int = thr->words[cp->bit_idx[1]].w_int;
      return true;
}

bool of_IX_GETV(vthread_t thr, vvp_code_t cp)
{
      unsigned index = cp->bit_idx[0];
      vvp_net_t*net = cp->net;

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*>(net->fil);
      if (sig == 0) {
	    assert(net->fil);
	    cerr << "%%ix/getv error: Net arg not a vector signal? "
		 << typeid(*net->fil).name() << endl;
      }
      assert(sig);

      vvp_vector4_t vec;
      sig->vec4_value(vec);
      uint64_t val;
      bool known_flag = vector4_to_value(vec, val);

      if (known_flag)
	    thr->words[index].w_uint = val;
      else
	    thr->words[index].w_uint = 0;

	/* Set bit 4 as a flag if the input is unknown. */
      thr->flags[4] = known_flag ? BIT4_0 : BIT4_1;

      return true;
}

bool of_IX_GETV_S(vthread_t thr, vvp_code_t cp)
{
      unsigned index = cp->bit_idx[0];
      vvp_net_t*net = cp->net;

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*>(net->fil);
      if (sig == 0) {
	    assert(net->fil);
	    cerr << "%%ix/getv/s error: Net arg not a vector signal? "
		 << "fun=" << typeid(*net->fil).name()
		 << ", fil=" << (net->fil? typeid(*net->fil).name() : "<>")
		 << endl;
      }
      assert(sig);

      vvp_vector4_t vec;
      sig->vec4_value(vec);
      int64_t val;
      bool known_flag = vector4_to_value(vec, val, true, true);

      if (known_flag)
	    thr->words[index].w_int = val;
      else
	    thr->words[index].w_int = 0;

	/* Set bit 4 as a flag if the input is unknown. */
      thr->flags[4] = known_flag? BIT4_0 : BIT4_1;

      return true;
}

static uint64_t vec4_to_index(vthread_t thr, bool signed_flag)
{
	// Get all the information we need about the vec4 vector, then
	// pop it away. We only need the bool bits and the length.
      const vvp_vector4_t&val = thr->peek_vec4();
      unsigned val_size = val.size();
      unsigned long*bits = val.subarray(0, val_size, false);
      thr->pop_vec4(1);

	// If there are X/Z bits, then the subarray will give us a nil
	// pointer. Set a flag to indicate the error, and give up.
      if (bits == 0) {
	    thr->flags[4] = BIT4_1;
	    return 0;
      }

      uint64_t v = 0;
      thr->flags[4] = BIT4_0;

      assert(sizeof(bits[0]) <= sizeof(v));
	//assert(val_size <= 8*sizeof(v));

      v = 0;
      for (unsigned idx = 0 ; idx < val_size ; idx += 8*sizeof(bits[0])) {
	    uint64_t tmp = bits[idx/8/sizeof(bits[0])];
	    v |= tmp << idx;
      }

	// Set the high bits that are not necessarily filled in by the
	// subarray function.
      if (val_size < 8*sizeof(v)) {
	    if (signed_flag && (v & (static_cast<uint64_t>(1)<<(val_size-1)))) {
		    // Propagate the sign bit...
		  v |= (~static_cast<uint64_t>(0)) << val_size;

	    } else {
		    // Fill with zeros.
		  v &= ~((~static_cast<uint64_t>(0)) << val_size);
	    }

      }

      delete[]bits;
      return v;
}

/*
 * %ix/vec4 <idx>
 */
bool of_IX_VEC4(vthread_t thr, vvp_code_t cp)
{
      unsigned use_idx = cp->number;
      thr->words[use_idx].w_uint = vec4_to_index(thr, false);
      return true;
}

/*
 * %ix/vec4/s <idx>
 */
bool of_IX_VEC4_S(vthread_t thr, vvp_code_t cp)
{
      unsigned use_idx = cp->number;
      thr->words[use_idx].w_uint = vec4_to_index(thr, true);
      return true;
}

/*
 * The various JMP instruction work simply by pulling the new program
 * counter from the instruction and resuming. If the jump is
 * conditional, then test the bit for the expected value first.
 */
bool of_JMP(vthread_t thr, vvp_code_t cp)
{
      thr->pc = cp->cptr;

	/* Normally, this returns true so that the processor just
	   keeps going to the next instruction. However, if there was
	   a $stop or vpiStop, returning false here can break the
	   simulation out of a hung loop. */
      if (schedule_stopped()) {
	    schedule_vthread(thr, 0, false);
	    return false;
      }

      return true;
}

/*
 * %jmp/0 <pc>, <flag>
 */
bool of_JMP0(vthread_t thr, vvp_code_t cp)
{
      if (thr->flags[cp->bit_idx[0]] == BIT4_0)
	    thr->pc = cp->cptr;

	/* Normally, this returns true so that the processor just
	   keeps going to the next instruction. However, if there was
	   a $stop or vpiStop, returning false here can break the
	   simulation out of a hung loop. */
      if (schedule_stopped()) {
	    schedule_vthread(thr, 0, false);
	    return false;
      }

      return true;
}

/*
 * %jmp/0xz <pc>, <flag>
 */
bool of_JMP0XZ(vthread_t thr, vvp_code_t cp)
{
      if (thr->flags[cp->bit_idx[0]] != BIT4_1)
	    thr->pc = cp->cptr;

	/* Normally, this returns true so that the processor just
	   keeps going to the next instruction. However, if there was
	   a $stop or vpiStop, returning false here can break the
	   simulation out of a hung loop. */
      if (schedule_stopped()) {
	    schedule_vthread(thr, 0, false);
	    return false;
      }

      return true;
}

/*
 * %jmp/1 <pc>, <flag>
 */
bool of_JMP1(vthread_t thr, vvp_code_t cp)
{
      if (thr->flags[cp->bit_idx[0]] == BIT4_1)
	    thr->pc = cp->cptr;

	/* Normally, this returns true so that the processor just
	   keeps going to the next instruction. However, if there was
	   a $stop or vpiStop, returning false here can break the
	   simulation out of a hung loop. */
      if (schedule_stopped()) {
	    schedule_vthread(thr, 0, false);
	    return false;
      }

      return true;
}

/*
 * %jmp/1xz <pc>, <flag>
 */
bool of_JMP1XZ(vthread_t thr, vvp_code_t cp)
{
      if (thr->flags[cp->bit_idx[0]] != BIT4_0)
	    thr->pc = cp->cptr;

	/* Normally, this returns true so that the processor just
	   keeps going to the next instruction. However, if there was
	   a $stop or vpiStop, returning false here can break the
	   simulation out of a hung loop. */
      if (schedule_stopped()) {
	    schedule_vthread(thr, 0, false);
	    return false;
      }

      return true;
}

/*
 * The %join instruction causes the thread to wait for one child
 * to die.  If a child is already dead (and a zombie) then I reap
 * it and go on. Otherwise, I mark myself as waiting in a join so that
 * children know to wake me when they finish.
 */

static bool test_joinable(vthread_t thr, vthread_t child)
{
      set<vthread_t>::iterator cur = thr->task_func_children.find(child);
      if (! thr->task_func_children.empty() && cur == thr->task_func_children.end())
	    return false;

      return true;
}

static void do_join(vthread_t thr, vthread_t child)
{
      assert(child->parent == thr);

	/* Remove the thread from the task/function set if needed. */
      thr->task_func_children.erase(child);

        /* If the immediate child thread is in an automatic scope... */
      if (child->wt_context) {
              /* and is the top level task/function thread... */
            if (thr->wt_context != thr->rd_context) {
                    /* Pop the child context from the write context stack. */
                  vvp_context_t child_context = thr->wt_context;
                  thr->wt_context = vvp_get_stacked_context(child_context);

                    /* Push the child context onto the read context stack */
                  vvp_set_stacked_context(child_context, thr->rd_context);
                  thr->rd_context = child_context;
            }
      }

      vthread_reap(child);
}

static bool do_join_opcode(vthread_t thr)
{
      assert( !thr->i_am_joining );
      assert( !thr->children.empty());

	// Are there any children that have already ended? If so, then
	// join with that one.
      for (set<vthread_t>::iterator cur = thr->children.begin()
		 ; cur != thr->children.end() ; ++cur) {
	    vthread_t curp = *cur;
	    if (! curp->i_have_ended)
		  continue;

	    if (! test_joinable(thr, curp))
		  continue;

	      // found something!
	    do_join(thr, curp);
	    return true;
      }

	// Otherwise, tell my children to awaken me when they end,
	// then pause.
      thr->i_am_joining = 1;
      return false;
}

bool of_JOIN(vthread_t thr, vvp_code_t)
{
      return do_join_opcode(thr);
}

/*
 * This %join/detach <n> instruction causes the thread to detach
 * threads that were created by an earlier %fork.
 */
bool of_JOIN_DETACH(vthread_t thr, vvp_code_t cp)
{
      unsigned long count = cp->number;

      assert(thr->task_func_children.empty());
      assert(count == thr->children.size());

      while (! thr->children.empty()) {
	    vthread_t child = *thr->children.begin();
	    assert(child->parent == thr);

	      // We cannot detach automatic tasks/functions within an
	      // automatic scope. If we try to do that, we might make
	      // a mess of the allocation of the context. Note that it
	      // is OK if the child context is distinct (See %exec_ufunc.)
	    assert(child->wt_context==0 || thr->wt_context!=child->wt_context);
	    if (child->i_have_ended) {
		    // If the child has already ended, then reap it.
		  vthread_reap(child);

	    } else {
		  size_t res = child->parent->children.erase(child);
		  assert(res == 1);
		  child->i_am_detached = 1;
		  thr->detached_children.insert(child);
	    }
      }

      return true;
}

/*
 * %load/ar <array-label>, <index>;
*/
bool of_LOAD_AR(vthread_t thr, vvp_code_t cp)
{
      unsigned idx = cp->bit_idx[0];
      unsigned adr = thr->words[idx].w_int;
      double word;

	/* The result is 0.0 if the address is undefined. */
      if (thr->flags[4] == BIT4_1) {
	    word = 0.0;
      } else {
	    word = cp->array->get_word_r(adr);
      }

      thr->push_real(word);
      return true;
}

/*
 * %load/dar/r <array-label>;
 */
bool of_LOAD_DAR_R(vthread_t thr, vvp_code_t cp)
{
      unsigned adr = thr->words[3].w_int;
      vvp_net_t*net = cp->net;

      assert(net);
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      vvp_darray*darray = obj->get_object().peek<vvp_darray>();
      assert(darray);

      double word;
      darray->get_word(adr, word);

      thr->push_real(word);
      return true;
}

bool of_LOAD_DAR_STR(vthread_t thr, vvp_code_t cp)
{
      unsigned adr = thr->words[3].w_int;
      vvp_net_t*net = cp->net;

      assert(net);
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      vvp_darray*darray = obj->get_object().peek<vvp_darray>();
      assert(darray);

      string word;
      darray->get_word(adr, word);
      thr->push_str(word);

      return true;
}

bool of_LOAD_DAR_VEC4(vthread_t thr, vvp_code_t cp)
{
      unsigned adr = thr->words[3].w_int;
      vvp_net_t*net = cp->net;

      assert(net);
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      vvp_darray*darray = obj->get_object().peek<vvp_darray>();
      assert(darray);

      vvp_vector4_t word;
      darray->get_word(adr, word);
      thr->push_vec4(word);

      return true;
}

/*
 * %load/obj <var-label>
 */
bool of_LOAD_OBJ(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      vvp_fun_signal_object*fun = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(fun);

      vvp_object_t val = fun->get_object();
      thr->push_object(val);

      return true;
}

/*
 * %load/obja <index>
 *    Loads the object from array, using index <index> as the index
 *    value. If flags[4] == 1, the calculation of <index> may have
 *    failed, so push nil.
 */
bool of_LOAD_OBJA(vthread_t thr, vvp_code_t cp)
{
      unsigned idx = cp->bit_idx[0];
      unsigned adr = thr->words[idx].w_int;
      vvp_object_t word;

	/* The result is 0.0 if the address is undefined. */
      if (thr->flags[4] == BIT4_1) {
	    ; // Return nil
      } else {
	    cp->array->get_word_obj(adr, word);
      }

      thr->push_object(word);
      return true;
}

/*
 * %load/real <var-label>
 */
bool of_LOAD_REAL(vthread_t thr, vvp_code_t cp)
{
      __vpiHandle*tmp = cp->handle;
      t_vpi_value val;

      val.format = vpiRealVal;
      vpi_get_value(tmp, &val);

      thr->push_real(val.value.real);

      return true;
}

/*
 * %load/str <var-label>
 */
bool of_LOAD_STR(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;


      vvp_fun_signal_string*fun = dynamic_cast<vvp_fun_signal_string*> (net->fun);
      assert(fun);

      const string&val = fun->get_string();
      thr->push_str(val);

      return true;
}

bool of_LOAD_STRA(vthread_t thr, vvp_code_t cp)
{
      unsigned idx = cp->bit_idx[0];
      unsigned adr = thr->words[idx].w_int;
      string word;

      if (thr->flags[4] == BIT4_1) {
	    word = "";
      } else {
	    word = cp->array->get_word_str(adr);
      }

      thr->push_str(word);
      return true;
}


/*
 * %load/vec4 <net>
 */
bool of_LOAD_VEC4(vthread_t thr, vvp_code_t cp)
{
	// Push a placeholder onto the stack in order to reserve the
	// stack space. Use a reference for the stack top as a target
	// for the load.
      thr->push_vec4(vvp_vector4_t());
      vvp_vector4_t&sig_value = thr->peek_vec4();

      vvp_net_t*net = cp->net;

	// For the %load to work, the functor must actually be a
	// signal functor. Only signals save their vector value.
      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (net->fil);
      if (sig == 0) {
	    cerr << "%load/v error: Net arg not a signal? "
		 << (net->fil ? typeid(*net->fil).name() : typeid(*net->fun).name()) << endl;
	    assert(sig);
      }

	// Extract the value from the signal and directly into the
	// target stack position.
      sig->vec4_value(sig_value);

      return true;
}

/*
 * %load/vec4a <arr>, <adrx>
 */
bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp)
{
      int adr_index = cp->bit_idx[0];

      long adr = thr->words[adr_index].w_int;

	// If flag[3] is set, then the calculation of the address
	// failed, and this load should return X instead of the actual
	// value.
      if (thr->flags[4] == BIT4_1) {
	    vvp_vector4_t tmp (cp->array->get_word_size(), BIT4_X);
	    thr->push_vec4(tmp);
	    return true;
      }

      vvp_vector4_t tmp (cp->array->get_word(adr));
      thr->push_vec4(tmp);
      return true;
}

static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb,
			    bool left_is_neg, bool right_is_neg)
{
      bool out_is_neg = left_is_neg;
      const int len=vala.size();
      unsigned char *a, *z, *t;
      a = new unsigned char[len+1];
      z = new unsigned char[len+1];
      t = new unsigned char[len+1];

      unsigned char carry;
      unsigned char temp;

      int mxa = -1, mxz = -1;
      int i;
      int current, copylen;

      unsigned lb_carry = left_is_neg? 1 : 0;
      unsigned rb_carry = right_is_neg? 1 : 0;
      for (int idx = 0 ;  idx < len ;  idx += 1) {
	    unsigned lb = vala.value(idx);
	    unsigned rb = valb.value(idx);

	    if ((lb | rb) & 2) {
		  delete []t;
		  delete []z;
		  delete []a;
		  vvp_vector4_t tmp(len, BIT4_X);
		  vala = tmp;
		  return;
	    }

	    if (left_is_neg) {
		  lb = (1-lb) + lb_carry;
		  lb_carry = (lb & ~1)? 1 : 0;
		  lb &= 1;
	    }
	    if (right_is_neg) {
		  rb = (1-rb) + rb_carry;
		  rb_carry = (rb & ~1)? 1 : 0;
		  rb &= 1;
	    }

	    z[idx]=lb;
	    a[idx]=1-rb;	// for 2s complement add..
      }

      z[len]=0;
      a[len]=1;

      for(i=len-1;i>=0;i--) {
	    if(! a[i]) {
		  mxa=i;
		  break;
	    }
      }

      for(i=len-1;i>=0;i--) {
	    if(z[i]) {
		  mxz=i;
		  break;
	    }
      }

      if((mxa>mxz)||(mxa==-1)) {
	    if(mxa==-1) {
		  delete []t;
		  delete []z;
		  delete []a;
		  vvp_vector4_t tmpx (len, BIT4_X);
		  vala = tmpx;
		  return;
	    }

	    goto tally;
      }

      copylen = mxa + 2;
      current = mxz - mxa;

      while(current > -1) {
	    carry = 1;
	    for(i=0;i<copylen;i++) {
		  temp = z[i+current] + a[i] + carry;
		  t[i] = (temp&1);
		  carry = (temp>>1);
	    }

	    if(carry) {
		  for(i=0;i<copylen;i++) {
			z[i+current] = t[i];
		  }
	    }

	    current--;
      }

 tally:

      vvp_vector4_t tmp (len, BIT4_X);
      carry = out_is_neg? 1 : 0;
      for (int idx = 0 ;  idx < len ;  idx += 1) {
	    unsigned ob = z[idx];
	    if (out_is_neg) {
		  ob = (1-ob) + carry;
		  carry = (ob & ~1)? 1 : 0;
		  ob = ob & 1;
	    }
	    tmp.set_bit(idx, ob?BIT4_1:BIT4_0);
      }
      vala = tmp;
      delete []t;
      delete []z;
      delete []a;
}

bool of_MAX_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      if (r != r)
	    thr->push_real(l);
      else if (l != l)
	    thr->push_real(r);
      else if (r < l)
	    thr->push_real(l);
      else
	    thr->push_real(r);
      return true;
}

bool of_MIN_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      if (r != r)
	    thr->push_real(l);
      else if (l != l)
	    thr->push_real(r);
      else if (r < l)
	    thr->push_real(r);
      else
	    thr->push_real(l);
      return true;
}

bool of_MOD(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valb = thr->pop_vec4();
      vvp_vector4_t&vala = thr->peek_vec4();

      assert(vala.size()==valb.size());
      unsigned wid = vala.size();

      if(wid <= 8*sizeof(unsigned long long)) {
	    unsigned long long lv = 0, rv = 0;

	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
		  unsigned long long lb = vala.value(idx);
		  unsigned long long rb = valb.value(idx);

		  if ((lb | rb) & 2)
			goto x_out;

		  lv |= (unsigned long long) lb << idx;
		  rv |= (unsigned long long) rb << idx;
	    }

	    if (rv == 0)
		  goto x_out;

	    lv %= rv;

	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
		  vala.set_bit(idx, (lv&1)?BIT4_1 : BIT4_0);
		  lv >>= 1;
	    }

	    return true;

      } else {
	    do_verylong_mod(vala, valb, false, false);
	    return true;
      }

 x_out:
      vala = vvp_vector4_t(wid, BIT4_X);
      return true;
}

/*
 * %mod/s
 */
bool of_MOD_S(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valb = thr->pop_vec4();
      vvp_vector4_t&vala = thr->peek_vec4();

      assert(vala.size()==valb.size());
      unsigned wid = vala.size();

	/* Handle the case that we can fit the bits into a long-long
	   variable. We cause use native % to do the work. */
      if(wid <= 8*sizeof(long long)) {
	    long long lv = 0, rv = 0;

	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
		  long long lb = vala.value(idx);
		  long long rb = valb.value(idx);

		  if ((lb | rb) & 2)
			goto x_out;

		  lv |= (long long) lb << idx;
		  rv |= (long long) rb << idx;
	    }

	    if (rv == 0)
		  goto x_out;

	      /* Sign extend the signed operands when needed. */
	    if (wid < 8*sizeof(long long)) {
		  if (lv & (1LL << (wid-1)))
			lv |= -1LL << wid;
		  if (rv & (1LL << (wid-1)))
			rv |= -1LL << wid;
	    }

	    lv %= rv;

	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
		  vala.set_bit(idx, (lv&1)? BIT4_1 : BIT4_0);
		  lv >>= 1;
	    }

	      // vala is the top of the stack, edited in place, so we
	      // do not need to push the result.

	    return true;

      } else {

	    bool left_is_neg  = vala.value(vala.size()-1) == BIT4_1;
	    bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
	    do_verylong_mod(vala, valb, left_is_neg, right_is_neg);
	    return true;
      }

 x_out:
      vala = vvp_vector4_t(wid, BIT4_X);
      return true;
}

/*
 * %mod/wr
 */
bool of_MOD_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      thr->push_real(fmod(l,r));

      return true;
}

/*
 * %pad/s <wid>
 */
bool of_PAD_S(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&val = thr->peek_vec4();
      unsigned old_size = val.size();

	// Sign-extend.
      if (old_size < wid)
	    val.resize(wid, val.value(old_size-1));
      else
	    val.resize(wid);

      return true;
}

/*
 * %pad/u <wid>
 */
bool of_PAD_U(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&val = thr->peek_vec4();
      val.resize(wid, BIT4_0);

      return true;
}

/*
 * %part/s <wid>
 * %part/u <wid>
 * Two values are popped from the stack. First, pop the canonical
 * index of the part select, and second is the value to be
 * selected. The result is pushed back to the stack.
 */
static bool of_PART_base(vthread_t thr, vvp_code_t cp, bool signed_flag)
{
      unsigned wid = cp->number;

      vvp_vector4_t base4 = thr->pop_vec4();
      vvp_vector4_t&value = thr->peek_vec4();

      vvp_vector4_t res (wid, BIT4_X);

	// NOTE: This is treating the vector as signed. Is that correct?
      int32_t base;
      bool value_ok = vector4_to_value(base4, base, signed_flag);
      if (! value_ok) {
	    value = res;
	    return true;
      }

      if (base >= (int32_t)value.size()) {
	    value = res;
	    return true;
      }

      if ((base+(int)wid) <= 0) {
	    value = res;
	    return true;
      }

      long vbase = 0;
      if (base < 0) {
	    vbase = -base;
	    wid -= vbase;
	    base = 0;
      }

      if ((base+wid) > value.size()) {
	    wid = value.size() - base;
      }

      res .set_vec(vbase, value.subvalue(base, wid));
      value = res;

      return true;
}

bool of_PART_S(vthread_t thr, vvp_code_t cp)
{
      return of_PART_base(thr, cp, true);
}

bool of_PART_U(vthread_t thr, vvp_code_t cp)
{
      return of_PART_base(thr, cp, false);
}

/*
 * %parti/s <wid>, <basei>, <base_wid>
 * %parti/u <wid>, <basei>, <base_wid>
 *
 * Pop the value to be selected. The result is pushed back to the stack.
 */
static bool of_PARTI_base(vthread_t thr, vvp_code_t cp, bool signed_flag)
{
      unsigned wid = cp->number;
      uint32_t base = cp->bit_idx[0];
      uint32_t bwid = cp->bit_idx[1];

      vvp_vector4_t&value = thr->peek_vec4();

      vvp_vector4_t res (wid, BIT4_X);

	// NOTE: This is treating the vector as signed. Is that correct?
      int32_t use_base = base;
      if (signed_flag && bwid < 32 && (base&(1<<(bwid-1)))) {
	    use_base |= (-1) << bwid;
      }

      if (use_base >= (int32_t)value.size()) {
	    value = res;
	    return true;
      }

      if ((use_base+(int32_t)wid) <= 0) {
	    value = res;
	    return true;
      }

      long vbase = 0;
      if (use_base < 0) {
	    vbase = -use_base;
	    wid -= vbase;
	    use_base = 0;
      }

      if ((use_base+wid) > value.size()) {
	    wid = value.size() - use_base;
      }

      res .set_vec(vbase, value.subvalue(use_base, wid));
      value = res;

      return true;
}

bool of_PARTI_S(vthread_t thr, vvp_code_t cp)
{
      return of_PARTI_base(thr, cp, true);
}

bool of_PARTI_U(vthread_t thr, vvp_code_t cp)
{
      return of_PARTI_base(thr, cp, false);
}

/*
*  %mov/wu <dst>, <src>
*/
bool of_MOV_WU(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

      thr->words[dst].w_uint = thr->words[src].w_uint;
      return true;
}

/*
 * %mul
 */
bool of_MUL(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t r = thr->pop_vec4();
	// Rather then pop l, use it directly from the stack. When we
	// assign to 'l', that will edit the top of the stack, which
	// replaces a pop and a pull.
      vvp_vector4_t&l = thr->peek_vec4();

      l.mul(r);
      return true;
}

/*
 * %muli <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments, and push
 * the result.
 */
bool of_MULI(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&l = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t r (wid, BIT4_0);
      get_immediate_rval (cp, r);

      l.mul(r);
      return true;
}

bool of_MUL_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      thr->push_real(l * r);

      return true;
}

bool of_NAND(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valr = thr->pop_vec4();
      vvp_vector4_t&vall = thr->peek_vec4();
      assert(vall.size() == valr.size());
      unsigned wid = vall.size();

      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
	    vvp_bit4_t lb = vall.value(idx);
	    vvp_bit4_t rb = valr.value(idx);
	    vall.set_bit(idx, ~(lb&rb));
      }

      return true;
}

/*
 * %new/cobj <vpi_object>
 * This creates a new cobject (SystemVerilog class object) and pushes
 * it to the stack. The <vpi-object> is a __vpiHandle that is a
 * vpiClassDefn object that defines the item to be created.
 */
bool of_NEW_COBJ(vthread_t thr, vvp_code_t cp)
{
      const class_type*defn = dynamic_cast<const class_type*> (cp->handle);
      assert(defn);

      vvp_object_t tmp (new vvp_cobject(defn));
      thr->push_object(tmp);
      return true;
}

bool of_NEW_DARRAY(vthread_t thr, vvp_code_t cp)
{
      const char*text = cp->text;
      size_t size = thr->words[cp->bit_idx[0]].w_int;
      unsigned word_wid;
      size_t n;

      vvp_object_t obj;
      if (strcmp(text,"b8") == 0) {
	    obj = new vvp_darray_atom<uint8_t>(size);
      } else if (strcmp(text,"b16") == 0) {
	    obj = new vvp_darray_atom<uint16_t>(size);
      } else if (strcmp(text,"b32") == 0) {
	    obj = new vvp_darray_atom<uint32_t>(size);
      } else if (strcmp(text,"b64") == 0) {
	    obj = new vvp_darray_atom<uint64_t>(size);
      } else if (strcmp(text,"sb8") == 0) {
	    obj = new vvp_darray_atom<int8_t>(size);
      } else if (strcmp(text,"sb16") == 0) {
	    obj = new vvp_darray_atom<int16_t>(size);
      } else if (strcmp(text,"sb32") == 0) {
	    obj = new vvp_darray_atom<int32_t>(size);
      } else if (strcmp(text,"sb64") == 0) {
	    obj = new vvp_darray_atom<int64_t>(size);
      } else if ((1 == sscanf(text, "b%u%zn", &word_wid, &n)) &&
                 (n == strlen(text))) {
	    obj = new vvp_darray_vec2(size, word_wid);
      } else if ((1 == sscanf(text, "sb%u%zn", &word_wid, &n)) &&
                 (n == strlen(text))) {
	    obj = new vvp_darray_vec2(size, word_wid);
      } else if ((1 == sscanf(text, "v%u%zn", &word_wid, &n)) &&
                 (n == strlen(text))) {
	    obj = new vvp_darray_vec4(size, word_wid);
      } else if ((1 == sscanf(text, "sv%u%zn", &word_wid, &n)) &&
                 (n == strlen(text))) {
	    obj = new vvp_darray_vec4(size, word_wid);
      } else if (strcmp(text,"r") == 0) {
	    obj = new vvp_darray_real(size);
      } else if (strcmp(text,"S") == 0) {
	    obj = new vvp_darray_string(size);
      } else {
	    cerr << "Internal error: Unsupported dynamic array type: "
	         << text << "." << endl;
	    assert(0);
      }

      thr->push_object(obj);

      return true;
}

bool of_NOOP(vthread_t, vvp_code_t)
{
      return true;
}

/*
 * %nor/r
 */
bool of_NORR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t val = thr->pop_vec4();

      vvp_bit4_t lb = BIT4_1;

      for (unsigned idx = 0 ;  idx < val.size() ;  idx += 1) {

	    vvp_bit4_t rb = val.value(idx);
	    if (rb == BIT4_1) {
		  lb = BIT4_0;
		  break;
	    }

	    if (rb != BIT4_0)
		  lb = BIT4_X;
      }

      vvp_vector4_t res (1, lb);
      thr->push_vec4(res);

      return true;
}

/*
 * Push a null to the object stack.
 */
bool of_NULL(vthread_t thr, vvp_code_t)
{
      vvp_object_t tmp;
      thr->push_object(tmp);
      return true;
}

/*
 * %and/r
 */
bool of_ANDR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t val = thr->pop_vec4();

      vvp_bit4_t lb = BIT4_1;

      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
	    vvp_bit4_t rb = val.value(idx);
	    if (rb == BIT4_0) {
		  lb = BIT4_0;
		  break;
	    }

	    if (rb != 1)
		  lb = BIT4_X;
      }

      vvp_vector4_t res (1, lb);
      thr->push_vec4(res);

      return true;
}

/*
 * %nand/r
 */
bool of_NANDR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t val = thr->pop_vec4();

      vvp_bit4_t lb = BIT4_0;
      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {

	    vvp_bit4_t rb = val.value(idx);
	    if (rb == BIT4_0) {
		  lb = BIT4_1;
		  break;
	    }

	    if (rb != BIT4_1)
		  lb = BIT4_X;
      }

      vvp_vector4_t res (1, lb);
      thr->push_vec4(res);

      return true;
}

/*
 * %or/r
 */
bool of_ORR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t val = thr->pop_vec4();

      vvp_bit4_t lb = BIT4_0;
      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
	    vvp_bit4_t rb = val.value(idx);
	    if (rb == BIT4_1) {
		  lb = BIT4_1;
		  break;
	    }

	    if (rb != BIT4_0)
		  lb = BIT4_X;
      }

      vvp_vector4_t res (1, lb);
      thr->push_vec4(res);
      return true;
}

/*
 * %xor/r
 */
bool of_XORR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t val = thr->pop_vec4();

      vvp_bit4_t lb = BIT4_0;
      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {

	    vvp_bit4_t rb = val.value(idx);
	    if (rb == BIT4_1)
		  lb = ~lb;
	    else if (rb != BIT4_0) {
		  lb = BIT4_X;
		  break;
	    }
      }

      vvp_vector4_t res (1, lb);
      thr->push_vec4(res);
      return true;
}

/*
 * %xnor/r
 */
bool of_XNORR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t val = thr->pop_vec4();

      vvp_bit4_t lb = BIT4_1;
      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {

	    vvp_bit4_t rb = val.value(idx);
	    if (rb == BIT4_1)
		  lb = ~lb;
	    else if (rb != BIT4_0) {
		  lb = BIT4_X;
		  break;
	    }
      }

      vvp_vector4_t res (1, lb);
      thr->push_vec4(res);
      return true;
}

/*
 * %or
 */
bool of_OR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valb = thr->pop_vec4();
      vvp_vector4_t&vala = thr->peek_vec4();
      vala |= valb;
      return true;
}

/*
 * %nor
 */
bool of_NOR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valr = thr->pop_vec4();
      vvp_vector4_t&vall = thr->peek_vec4();
      assert(vall.size() == valr.size());
      unsigned wid = vall.size();

      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
	    vvp_bit4_t lb = vall.value(idx);
	    vvp_bit4_t rb = valr.value(idx);
	    vall.set_bit(idx, ~(lb|rb));
      }

      return true;
}

/*
 * %pop/obj <num>, <skip>
 */
bool of_POP_OBJ(vthread_t thr, vvp_code_t cp)
{
      unsigned cnt = cp->bit_idx[0];
      unsigned skip = cp->bit_idx[1];

      thr->pop_object(cnt, skip);
      return true;
}

/*
 * %pop/real <number>
 */
bool of_POP_REAL(vthread_t thr, vvp_code_t cp)
{
      unsigned cnt = cp->number;
      for (unsigned idx = 0 ; idx < cnt ; idx += 1) {
	    (void) thr->pop_real();
      }
      return true;
}

/*
 *  %pop/str <number>
 */
bool of_POP_STR(vthread_t thr, vvp_code_t cp)
{
      unsigned cnt = cp->number;
      thr->pop_str(cnt);
      return true;
}

/*
 *  %pop/vec4 <number>
 */
bool of_POP_VEC4(vthread_t thr, vvp_code_t cp)
{
      unsigned cnt = cp->number;
      thr->pop_vec4(cnt);
      return true;
}

/*
 * %pow
 * %pow/s
 */
static bool of_POW_base(vthread_t thr, bool signed_flag)
{
      vvp_vector4_t valb = thr->pop_vec4();
      vvp_vector4_t vala = thr->pop_vec4();

      unsigned wid = vala.size();

      vvp_vector2_t xv2 = vvp_vector2_t(vala, true);
      vvp_vector2_t yv2 = vvp_vector2_t(valb, true);


        /* If we have an X or Z in the arguments return X. */
      if (xv2.is_NaN() || yv2.is_NaN()) {
	    vvp_vector4_t tmp (wid, BIT4_X);
	    thr->push_vec4(tmp);
	    return true;
      }

	// Is the exponent negative? If so, table 5-6 in IEEE1364-2005
	// defines what value is returned.
      if (signed_flag && yv2.value(yv2.size()-1)) {
	    int a_val;
	    vvp_bit4_t pad = BIT4_0, lsb = BIT4_0;
	    if (vector2_to_value(xv2, a_val, true)) {
		  if (a_val == 0) {
			pad = BIT4_X; lsb = BIT4_X;
		  }
		  if (a_val == 1) {
			pad = BIT4_0; lsb = BIT4_1;
		  }
		  if (a_val == -1) {
			if (yv2.value(0)) {
			      pad = BIT4_1; lsb = BIT4_1;
			} else {
			      pad = BIT4_0; lsb = BIT4_1;
			}
		  }
	    }
	    vvp_vector4_t tmp (wid, pad);
	    tmp.set_bit(0, lsb);
	    thr->push_vec4(tmp);
	    return true;
      }

      vvp_vector2_t result = pow(xv2, yv2);

        /* Copy only what we need of the result. If the result is too
	   small, zero-pad it. */
      for (unsigned jdx = 0;  jdx < wid;  jdx += 1) {
	    if (jdx >= result.size())
		  vala.set_bit(jdx, BIT4_0);
	    else
		  vala.set_bit(jdx, result.value(jdx) ? BIT4_1 : BIT4_0);
      }
      thr->push_vec4(vala);

      return true;
}

bool of_POW(vthread_t thr, vvp_code_t)
{
      return of_POW_base(thr, false);
}

bool of_POW_S(vthread_t thr, vvp_code_t)
{
      return of_POW_base(thr, true);
}

bool of_POW_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      thr->push_real(pow(l,r));

      return true;
}

/*
 * %prop/obj <pid>, <idx>
 *
 * Load an object value from the cobject and push it onto the object stack.
 */
bool of_PROP_OBJ(vthread_t thr, vvp_code_t cp)
{
      unsigned pid = cp->number;
      unsigned idx = cp->bit_idx[0];

      if (idx != 0) {
	    assert(idx < vthread_s::WORDS_COUNT);
	    idx = thr->words[idx].w_uint;
      }

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();

      vvp_object_t val;
      cobj->get_object(pid, val, idx);

      thr->push_object(val);

      return true;
}

/*
 * %prop/r <pid>
 *
 * Load a real value from the cobject and push it onto the real value
 * stack.
 */
bool of_PROP_R(vthread_t thr, vvp_code_t cp)
{
      unsigned pid = cp->number;

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();

      double val = cobj->get_real(pid);
      thr->push_real(val);

      return true;
}

/*
 * %prop/str <pid>
 *
 * Load a string value from the cobject and push it onto the real value
 * stack.
 */
bool of_PROP_STR(vthread_t thr, vvp_code_t cp)
{
      unsigned pid = cp->number;

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();

      string val = cobj->get_string(pid);
      thr->push_str(val);

      return true;
}

/*
 * %prop/v <pid>
 *
 * Load a property <pid> from the cobject on the top of the stack into
 * the vector space at <base>.
 */
bool of_PROP_V(vthread_t thr, vvp_code_t cp)
{
      unsigned pid = cp->number;

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();

      vvp_vector4_t val;
      cobj->get_vec4(pid, val);
      thr->push_vec4(val);

      return true;
}

bool of_PUSHI_REAL(vthread_t thr, vvp_code_t cp)
{
      double mant = cp->bit_idx[0];
      uint32_t imant = cp->bit_idx[0];
      int exp = cp->bit_idx[1];

	// Detect +infinity
      if (exp==0x3fff && imant==0) {
	    thr->push_real(INFINITY);
	    return true;
      }
	// Detect -infinity
      if (exp==0x7fff && imant==0) {
	    thr->push_real(-INFINITY);
	    return true;
      }
	// Detect NaN
      if (exp==0x3fff) {
	    thr->push_real(nan(""));
	    return true;
      }

      double sign = (exp & 0x4000)? -1.0 : 1.0;

      exp &= 0x1fff;

      mant = sign * ldexp(mant, exp - 0x1000);
      thr->push_real(mant);
      return true;
}

bool of_PUSHI_STR(vthread_t thr, vvp_code_t cp)
{
      const char*text = cp->text;
      thr->push_str(string(text));
      return true;
}

/*
 * %pushi/vec4 <vala>, <valb>, <wid>
 */
bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
{
      uint32_t vala = cp->bit_idx[0];
      uint32_t valb = cp->bit_idx[1];
      unsigned wid  = cp->number;

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t val (wid, BIT4_0);

	// Special case: Immediate zero is super easy.
      if (vala==0 && valb==0) {
	    thr->push_vec4(val);
	    return true;
      }

	// Special case: If the value is defined (no X or Z) and fits
	// in an unsigned long, then use the setarray method to write
	// the value all in one shot.
      if ((valb==0) && (wid <= 8*sizeof(unsigned long))) {
	    unsigned long tmp = vala;
	    val.setarray(0, wid, &tmp);
	    thr->push_vec4(val);
	    return true;
      }

	// The %pushi/vec4 can create values bigger then 32 bits, but
	// only if the high bits are zero. So at most we need to run
	// through the loop below 32 times. Maybe less, if the target
	// width is less. We don't have to do anything special on that
	// because vala/valb bits will shift away so (vala|valb) will
	// turn to zero at or before 32 shifts.

      for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
	    uint32_t ba = 0;
	      // Convert the vala/valb bits to a ba number that can be
	      // used to select what goes into the value.
	    ba = (valb & 1) << 1;
	    ba |= vala & 1;

	    switch (ba) {
		case 1:
		  val.set_bit(idx, BIT4_1);
		  break;
		case 2:
		  val.set_bit(idx, BIT4_Z);
		  break;
		case 3:
		  val.set_bit(idx, BIT4_X);
		  break;
		default:
		  break;
	    }

	    vala >>= 1;
	    valb >>= 1;
      }

      thr->push_vec4(val);

      return true;
}

/*
 * %pushv/str
 *   Pops a vec4 value, and pushes a string.
 */
bool of_PUSHV_STR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t vec = thr->pop_vec4();

      size_t slen = (vec.size() + 7)/8;
      vector<char>buf;
      buf.reserve(slen);

      for (size_t idx = 0 ; idx < vec.size() ; idx += 8) {
	    char tmp = 0;
	    size_t trans = 8;
	    if (idx+trans > vec.size())
		  trans = vec.size() - idx;

	    for (size_t bdx = 0 ; bdx < trans ; bdx += 1) {
		  if (vec.value(idx+bdx) == BIT4_1)
			tmp |= 1 << bdx;
	    }

	    if (tmp != 0)
		  buf.push_back(tmp);
      }

      string val;
      for (vector<char>::reverse_iterator cur = buf.rbegin()
		 ; cur != buf.rend() ; ++cur) {
	    val.push_back(*cur);
      }

      thr->push_str(val);

      return true;
}

/*
 * %putc/str/vec4 <var>, <mux>
 */
bool of_PUTC_STR_VEC4(vthread_t thr, vvp_code_t cp)
{
      unsigned muxr = cp->bit_idx[0];
      int32_t mux = muxr? thr->words[muxr].w_int : 0;

      vvp_vector4_t val = thr->pop_vec4();
      assert(val.size() == 8);

      if (mux < 0)
	    return true;

	/* Get the existing value of the string. If we find that the
	   index is too big for the string, then give up. */
      vvp_net_t*net = cp->net;
      vvp_fun_signal_string*fun = dynamic_cast<vvp_fun_signal_string*> (net->fun);
      assert(fun);

      string tmp = fun->get_string();
      if (tmp.size() <= (size_t)mux)
	    return true;

      char val_str = 0;
      for (size_t idx = 0 ; idx < 8 ; idx += 1) {
	    if (val.value(idx)==BIT4_1)
		  val_str |= 1<<idx;
      }

	// It is a quirk of the Verilog standard that putc(..., 'h00)
	// has no effect. Test for that case here.
      if (val_str == 0)
	    return true;

      tmp[mux] = val_str;

      vvp_send_string(vvp_net_ptr_t(cp->net, 0), tmp, thr->wt_context);
      return true;
}

bool of_QPOP_B_STR(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      vvp_queue*dqueue = get_queue_object<vvp_queue_string>(thr, net);
      assert(dqueue);

      size_t size = dqueue->get_size();
      assert(size > 0);

      string value;
      dqueue->get_word(size-1, value);
      dqueue->pop_back();

      thr->push_str(value);
      return true;
}

/*
 * %qpop/b/v <var-label>
 */
bool of_QPOP_B_V(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      vvp_queue*dqueue = get_queue_object<vvp_queue_vec4>(thr, net);
      assert(dqueue);

      size_t size = dqueue->get_size();
      assert(size > 0);

      vvp_vector4_t value;
      dqueue->get_word(size-1, value);
      dqueue->pop_back();

      thr->push_vec4(value);
      return true;
}

bool of_QPOP_F_STR(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      vvp_queue*dqueue = get_queue_object<vvp_queue_string>(thr, net);
      assert(dqueue);

      string value;
      dqueue->get_word(0, value);
      dqueue->pop_front();

      thr->push_str(value);
      return true;
}

/*
 * %qpop/f/v <var-label>
 */
bool of_QPOP_F_V(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      vvp_queue*dqueue = get_queue_object<vvp_queue_vec4>(thr, net);
      assert(dqueue);

      size_t size = dqueue->get_size();
      assert(size > 0);

      vvp_vector4_t value;
      dqueue->get_word(0, value);
      dqueue->pop_front();

      thr->push_vec4(value);
      return true;
}

/*
 * These implement the %release/net and %release/reg instructions. The
 * %release/net instruction applies to a net kind of functor by
 * sending the release/net command to the command port. (See vvp_net.h
 * for details.) The %release/reg instruction is the same, but sends
 * the release/reg command instead. These are very similar to the
 * %deassign instruction.
 */
static bool do_release_vec(vvp_code_t cp, bool net_flag)
{
      vvp_net_t*net = cp->net;
      unsigned base  = cp->bit_idx[0];
      unsigned width = cp->bit_idx[1];

      assert(net->fil);

      if (base >= net->fil->filter_size()) return true;
      if (base+width > net->fil->filter_size())
	    width = net->fil->filter_size() - base;

      bool full_sig = base == 0 && width == net->fil->filter_size();

	// XXXX Can't really do this if this is a partial release?
      net->fil->force_unlink();

	/* Do we release all or part of the net? */
      vvp_net_ptr_t ptr (net, 0);
      if (full_sig) {
	    net->fil->release(ptr, net_flag);
      } else {
	    net->fil->release_pv(ptr, base, width, net_flag);
      }
      net->fun->force_flag(false);

      return true;
}

bool of_RELEASE_NET(vthread_t, vvp_code_t cp)
{
      return do_release_vec(cp, true);
}


bool of_RELEASE_REG(vthread_t, vvp_code_t cp)
{
      return do_release_vec(cp, false);
}

/* The type is 1 for registers and 0 for everything else. */
bool of_RELEASE_WR(vthread_t, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      unsigned type  = cp->bit_idx[0];

      assert(net->fil);
      net->fil->force_unlink();

	// Send a command to this signal to unforce itself.
      vvp_net_ptr_t ptr (net, 0);
      net->fil->release(ptr, type==0);
      return true;
}

bool of_REPLICATE(vthread_t thr, vvp_code_t cp)
{
      int rept = cp->number;
      vvp_vector4_t val = thr->pop_vec4();
      vvp_vector4_t res (val.size() * rept, BIT4_X);

      for (int idx = 0 ; idx < rept ; idx += 1) {
	    res.set_vec(idx * val.size(), val);
      }

      thr->push_vec4(res);

      return true;
}

/*
 * %ret/real <index>
 */
bool of_RET_REAL(vthread_t thr, vvp_code_t cp)
{
      size_t index = cp->number;
      double val = thr->pop_real();

      vthread_t fun_thr = thr;
      while (fun_thr->parent_scope->get_type_code() != vpiFunction) {
	    assert(fun_thr->parent);
	    fun_thr = fun_thr->parent;
      }
      assert(index < fun_thr->args_real.size());
      unsigned depth = fun_thr->args_real[index];
	// Use the depth to put the value into the stack of
	// the parent thread.
      fun_thr->parent->poke_real(depth, val);
      return true;
}

/*
 * %ret/str <index>
 */
bool of_RET_STR(vthread_t thr, vvp_code_t cp)
{
      size_t index = cp->number;
      string val = thr->pop_str();

      vthread_t fun_thr = thr;
      while (fun_thr->parent_scope->get_type_code() != vpiFunction) {
	    assert(fun_thr->parent);
	    fun_thr = fun_thr->parent;
      }
      assert(index < fun_thr->args_str.size());
      unsigned depth = fun_thr->args_str[index];
	// Use the depth to put the value into the stack of
	// the parent thread.
      fun_thr->parent->poke_str(depth, val);
      return true;
}

/*
 * %ret/vec4 <index>, <offset>, <wid>
 */
bool of_RET_VEC4(vthread_t thr, vvp_code_t cp)
{
      size_t index = cp->number;
      unsigned off_index = cp->bit_idx[0];
      int wid = cp->bit_idx[1];

      vthread_t fun_thr = thr;
      while (fun_thr->parent_scope->get_type_code() != vpiFunction) {
	    assert(fun_thr->parent);
	    fun_thr = fun_thr->parent;
      }
      assert(index < fun_thr->args_vec4.size());
      unsigned depth = fun_thr->args_vec4[index];

      int off = off_index? thr->words[off_index].w_int : 0;
      const int sig_value_size = fun_thr->parent->peek_vec4(depth).size();

      vvp_vector4_t&val = thr->peek_vec4();
      unsigned val_size = val.size();

      if (off_index!=0 && thr->flags[4] == BIT4_1) {
	    thr->pop_vec4(1);
	    return true;
      }

      if (off <= -wid) {
	    thr->pop_vec4(1);
	    return true;
      }

      if (off >= sig_value_size) {
	    thr->pop_vec4(1);
	    return true;
      }

	// If the index is below the vector, then only assign the high
	// bits that overlap with the target
      if (off < 0) {
	    int use_off = -off;
	    wid -= use_off;
	    val = val.subvalue(use_off, wid);
	    val_size = wid;
	    off = 0;
      }

	// If the value is partly above the target, then only assign
	// the bits that overlap
      if ((off+wid) > sig_value_size) {
	    wid = sig_value_size - off;
	    val = val.subvalue(0, wid);
	    val.resize(wid);
	    val_size = wid;
      }

      if (off==0 && val_size==(unsigned)sig_value_size) {
	    fun_thr->parent->poke_vec4(depth, val);

      } else {
	    vvp_vector4_t tmp_dst = fun_thr->parent->peek_vec4(depth);
	    assert(wid>=0 && val.size() == (unsigned)wid);
	    tmp_dst.set_vec(off, val);
	    fun_thr->parent->poke_vec4(depth, tmp_dst);
      }

      thr->pop_vec4(1);
      return true;
}

/*
 * %retload/real <index>
 */
bool of_RETLOAD_REAL(vthread_t thr, vvp_code_t cp)
{
      size_t index = cp->number;


      vthread_t fun_thr = thr;
      while (fun_thr->parent_scope->get_type_code() != vpiFunction) {
	    assert(fun_thr->parent);
	    fun_thr = fun_thr->parent;
      }
      assert(index < fun_thr->args_real.size());
      unsigned depth = fun_thr->args_real[index];
	// Use the depth to extract the values from the stack
	// of the parent thread.
      thr->push_real(fun_thr->parent->peek_real(depth));
      return true;
}

/*
 * %retload/str <index>
 */
bool of_RETLOAD_STR(vthread_t thr, vvp_code_t cp)
{
      size_t index = cp->number;


      vthread_t fun_thr = thr;
      while (fun_thr->parent_scope->get_type_code() != vpiFunction) {
	    assert(fun_thr->parent);
	    fun_thr = fun_thr->parent;
      }
      assert(index < fun_thr->args_str.size());
      unsigned depth = fun_thr->args_str[index];
	// Use the depth to extract the values from the stack
	// of the parent thread.
      thr->push_str(fun_thr->parent->peek_str(depth));
      return true;
}

/*
 * %retload/vec4 <index>
 */
bool of_RETLOAD_VEC4(vthread_t thr, vvp_code_t cp)
{
      size_t index = cp->number;

      vthread_t fun_thr = thr;
      while (fun_thr->parent_scope->get_type_code() != vpiFunction) {
	    assert(fun_thr->parent);
	    fun_thr = fun_thr->parent;
      }
      assert(index < fun_thr->args_vec4.size());
      unsigned depth = fun_thr->args_vec4[index];
	// Use the depth to put the value into the stack of
	// the parent thread.
      thr->push_vec4(fun_thr->parent->peek_vec4(depth));
      return true;
}

bool of_SCOPY(vthread_t thr, vvp_code_t)
{
      vvp_object_t tmp;
      thr->pop_object(tmp);

      vvp_object_t&dest = thr->peek_object();
      dest.shallow_copy(tmp);

      return true;
}

/*
 * %set/dar/obj/real <index>
 */
bool of_SET_DAR_OBJ_REAL(vthread_t thr, vvp_code_t cp)
{
      unsigned adr = thr->words[cp->number].w_int;

      double value = thr->peek_real(0);

      vvp_object_t&top = thr->peek_object();
      vvp_darray*darray = top.peek<vvp_darray>();
      assert(darray);

      darray->set_word(adr, value);
      return true;
}

/*
 * %set/dar/obj/str <index>
 */
bool of_SET_DAR_OBJ_VEC4(vthread_t thr, vvp_code_t cp)
{
      unsigned adr = thr->words[cp->number].w_int;

      vvp_vector4_t value = thr->peek_vec4(0);

      vvp_object_t&top = thr->peek_object();
      vvp_darray*darray = top.peek<vvp_darray>();
      assert(darray);

      darray->set_word(adr, value);
      return true;
}

/*
 * %set/dar/obj/str <index>
 */
bool of_SET_DAR_OBJ_STR(vthread_t thr, vvp_code_t cp)
{
      unsigned adr = thr->words[cp->number].w_int;

      string value = thr->peek_str(0);

      vvp_object_t&top = thr->peek_object();
      vvp_darray*darray = top.peek<vvp_darray>();
      assert(darray);

      darray->set_word(adr, value);
      return true;
}

/*
 * %shiftl <idx>
 *
 * Pop the operand, then push the result.
 */
bool of_SHIFTL(vthread_t thr, vvp_code_t cp)
{
      int use_index = cp->number;
      int shift = thr->words[use_index].w_int;

      vvp_vector4_t&val = thr->peek_vec4();
      int wid  = val.size();

      if (thr->flags[4] == BIT4_1) {
	      // The result is 'bx if the shift amount is undefined
	    val = vvp_vector4_t(wid, BIT4_X);

      } else if (shift >= wid) {
	      // Shift is so big that all value is shifted out. Write
	      // a constant 0 result.
	    val = vvp_vector4_t(wid, BIT4_0);

      } else if (shift > 0) {
	    vvp_vector4_t blk = val.subvalue(0, wid-shift);
	    vvp_vector4_t tmp (shift, BIT4_0);
	    val.set_vec(0, tmp);
	    val.set_vec(shift, blk);

      } else if (shift < -wid) {
	    val = vvp_vector4_t(wid, BIT4_X);

      } else if (shift < 0) {
	      // Negative left shift is a right shift.
	      // For a negative shift, we pad with 'bx.
	    int use_shift = -shift;
	    vvp_vector4_t blk = val.subvalue(use_shift, wid-use_shift);
	    vvp_vector4_t tmp (use_shift, BIT4_X);
	    val.set_vec(0, blk);
	    val.set_vec(wid-use_shift, tmp);
      }

      return true;
}

/*
 * %shiftr <idx>
 * This is an unsigned right shift. The <idx> is a number that selects
 * the index register with the amount of the shift. This instruction
 * checks flag bit 4, which will be true if the shift is invalid.
 */
bool of_SHIFTR(vthread_t thr, vvp_code_t cp)
{
      int use_index = cp->number;
      int shift = thr->words[use_index].w_int;

      vvp_vector4_t val = thr->pop_vec4();
      int wid  = val.size();

      if (thr->flags[4] == BIT4_1) {
	    val = vvp_vector4_t(wid, BIT4_X);

      } else if (shift > wid) {
	    val = vvp_vector4_t(wid, BIT4_0);

      } else if (shift > 0) {
	    vvp_vector4_t blk = val.subvalue(shift, wid-shift);
	    vvp_vector4_t tmp (shift, BIT4_0);
	    val.set_vec(0, blk);
	    val.set_vec(wid-shift, tmp);

      } else if (shift < -wid) {
	    val = vvp_vector4_t(wid, BIT4_X);

      } else if (shift < 0) {
	      // Negative right shift is a left shift.
	      // For a negative shift, we pad with 'bx.
	    int use_shift = -shift;
	    vvp_vector4_t blk = val.subvalue(0, wid-use_shift);
	    vvp_vector4_t tmp (use_shift, BIT4_X);
	    val.set_vec(0, tmp);
	    val.set_vec(use_shift, blk);
      }

      thr->push_vec4(val);
      return true;
}

/*
 *  %shiftr/s <wid>
 */
bool of_SHIFTR_S(vthread_t thr, vvp_code_t cp)
{
      int use_index = cp->number;
      int shift = thr->words[use_index].w_int;

      vvp_vector4_t val = thr->pop_vec4();
      int wid  = val.size();

      vvp_bit4_t sign_bit = val.value(val.size()-1);

      if (thr->flags[4] == BIT4_1) {
	    val = vvp_vector4_t(wid, BIT4_X);

      } else if (shift > wid) {
	    val = vvp_vector4_t(wid, sign_bit);

      } else if (shift > 0) {
	    vvp_vector4_t blk = val.subvalue(shift, wid-shift);
	    vvp_vector4_t tmp (shift, sign_bit);
	    val.set_vec(0, blk);
	    val.set_vec(wid-shift, tmp);

      } else if (shift < -wid) {
	    val = vvp_vector4_t(wid, BIT4_X);

      } else if (shift < 0) {
	    int use_shift = -shift;
	    vvp_vector4_t blk = val.subvalue(0, wid-use_shift);
	    vvp_vector4_t tmp(use_shift, BIT4_X);
	    val.set_vec(0, tmp);
	    val.set_vec(use_shift, blk);
      }

      thr->push_vec4(val);
      return true;
}

/*
 * %split/vec4 <wid>
 *   Pop 1 value,
 *   Take <wid> bits from the lsb,
 *   Push the remaining msb,
 *   Push the lsb.
 */
bool of_SPLIT_VEC4(vthread_t thr, vvp_code_t cp)
{
      unsigned lsb_wid = cp->number;

      vvp_vector4_t&val = thr->peek_vec4();
      assert(lsb_wid < val.size());

      vvp_vector4_t lsb = val.subvalue(0, lsb_wid);
      val = val.subvalue(lsb_wid, val.size()-lsb_wid);

      thr->push_vec4(lsb);
      return true;
}

bool of_STORE_DAR_R(vthread_t thr, vvp_code_t cp)
{
      long adr = thr->words[3].w_int;

	// Pop the real value to be store...
      double value = thr->pop_real();

      vvp_net_t*net = cp->net;
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      vvp_darray*darray = obj->get_object().peek<vvp_darray>();
      assert(darray);

      darray->set_word(adr, value);
      return true;
}

/*
 * %store/dar/str <var>
 * In this case, <var> is the name of a dynamic array. Signed index
 * register 3 contains the index into the dynamic array.
 */
bool of_STORE_DAR_STR(vthread_t thr, vvp_code_t cp)
{
      long adr = thr->words[3].w_int;

	// Pop the string to be stored...
      string value = thr->pop_str();

      vvp_net_t*net = cp->net;
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      vvp_darray*darray = obj->get_object().peek<vvp_darray>();
      assert(darray);

      darray->set_word(adr, value);
      return true;
}


bool of_STORE_DAR_VEC4(vthread_t thr, vvp_code_t cp)
{
      long adr = thr->words[3].w_int;

	// Pop the real value to be store...
      vvp_vector4_t value = thr->pop_vec4();

      vvp_net_t*net = cp->net;
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      vvp_darray*darray = obj->get_object().peek<vvp_darray>();
      assert(darray);

      darray->set_word(adr, value);
      return true;
}

bool of_STORE_OBJ(vthread_t thr, vvp_code_t cp)
{
	/* set the value into port 0 of the destination. */
      vvp_net_ptr_t ptr (cp->net, 0);

      vvp_object_t val;
      thr->pop_object(val);

      vvp_send_object(ptr, val, thr->wt_context);

      return true;
}

/*
 * %store/obja <array-label> <index>
 */
bool of_STORE_OBJA(vthread_t thr, vvp_code_t cp)
{
      unsigned idx = cp->bit_idx[0];
      unsigned adr = thr->words[idx].w_int;

      vvp_object_t val;
      thr->pop_object(val);

      cp->array->set_word(adr, val);

      return true;
}


/*
 * %store/prop/obj <pid>, <idx>
 *
 * Pop an object value from the object stack, and store the value into
 * the property of the object references by the top of the stack. Do NOT
 * pop the object stack.
 */
bool of_STORE_PROP_OBJ(vthread_t thr, vvp_code_t cp)
{
      size_t pid = cp->number;
      unsigned idx = cp->bit_idx[0];

      if (idx != 0) {
	    assert(idx < vthread_s::WORDS_COUNT);
	    idx = thr->words[idx].w_uint;
      }

      vvp_object_t val;
      thr->pop_object(val);

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();
      assert(cobj);

      cobj->set_object(pid, val, idx);

      return true;
}

/*
 * %store/prop/r <id>
 *
 * Pop a real value from the real stack, and store the value into the
 * property of the object references by the top of the stack. Do NOT
 * pop the object stack.
 */
bool of_STORE_PROP_R(vthread_t thr, vvp_code_t cp)
{
      size_t pid = cp->number;
      double val = thr->pop_real();

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();
      assert(cobj);

      cobj->set_real(pid, val);

      return true;
}

/*
 * %store/prop/str <id>
 *
 * Pop a string value from the string stack, and store the value into
 * the property of the object references by the top of the stack. Do NOT
 * pop the object stack.
 */
bool of_STORE_PROP_STR(vthread_t thr, vvp_code_t cp)
{
      size_t pid = cp->number;
      string val = thr->pop_str();

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();
      assert(cobj);

      cobj->set_string(pid, val);

      return true;
}

/*
 * %store/prop/v <pid>, <wid>
 *
 * Store vector value into property <id> of cobject in the top of the
 * stack. Do NOT pop the object stack.
 */
bool of_STORE_PROP_V(vthread_t thr, vvp_code_t cp)
{
      size_t pid = cp->number;
      unsigned wid = cp->bit_idx[0];

      vvp_vector4_t val = thr->pop_vec4();

      assert(val.size() >= wid);
      val.resize(wid);

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj = obj.peek<vvp_cobject>();
      assert(cobj);

      cobj->set_vec4(pid, val);
      return true;
}

/*
 * %store/qb/r <var-label>
 */
bool of_STORE_QB_R(vthread_t, vvp_code_t)
{
      fprintf(stderr, "XXXX %%store/qb/r NOT IMPLEMENTED\n");
      return true;
}

/*
 * %store/qb/str <var-label>
 */
bool of_STORE_QB_STR(vthread_t thr, vvp_code_t cp)
{
	// Pop the string to be stored...
      string value = thr->pop_str();

      vvp_net_t*net = cp->net;
      vvp_queue*dqueue = get_queue_object<vvp_queue_string>(thr, net);

      assert(dqueue);
      dqueue->push_back(value);
      return true;
}

/*
 * %store/qb/v <var-label>, <wid>
 */
bool of_STORE_QB_V(vthread_t thr, vvp_code_t cp)
{
	// Pop the vec4 value to be stored...
      vvp_vector4_t value = thr->pop_vec4();

      vvp_net_t*net = cp->net;
      unsigned wid = cp->bit_idx[0];

      assert(value.size() == wid);

      vvp_queue*dqueue = get_queue_object<vvp_queue_vec4>(thr, net);

      assert(dqueue);
      dqueue->push_back(value);
      return true;
}


/*
 * %store/qf/r <var-label>
 */
bool of_STORE_QF_R(vthread_t, vvp_code_t)
{
      fprintf(stderr, "XXXX %%store/qf/r NOT IMPLEMENTED\n");
      return true;
}

/*
 * %store/qf/str <var-label>
 */
bool of_STORE_QF_STR(vthread_t, vvp_code_t)
{
      fprintf(stderr, "XXXX %%store/qf/str NOT IMPLEMENTED\n");
      return true;
}

/*
 * %store/qb/v <var-label>, <wid>
 */
bool of_STORE_QF_V(vthread_t thr, vvp_code_t cp)
{
	// Pop the vec4 value to be stored...
      vvp_vector4_t value = thr->pop_vec4();

      vvp_net_t*net = cp->net;
      unsigned wid = cp->bit_idx[0];

      vvp_queue*dqueue = get_queue_object<vvp_queue_vec4>(thr, net);

      assert(value.size() == wid);
      assert(dqueue);
      dqueue->push_front(value);
      return true;
}

bool of_STORE_REAL(vthread_t thr, vvp_code_t cp)
{
      double val = thr->pop_real();
	/* set the value into port 0 of the destination. */
      vvp_net_ptr_t ptr (cp->net, 0);
      vvp_send_real(ptr, val, thr->wt_context);

      return true;
}

/*
 * %store/reala <var-label> <index>
 */
bool of_STORE_REALA(vthread_t thr, vvp_code_t cp)
{
      unsigned idx = cp->bit_idx[0];
      unsigned adr = thr->words[idx].w_int;

      double val = thr->pop_real();
      cp->array->set_word(adr, val);

      return true;
}

bool of_STORE_STR(vthread_t thr, vvp_code_t cp)
{
	/* set the value into port 0 of the destination. */
      vvp_net_ptr_t ptr (cp->net, 0);

      string val = thr->pop_str();
      vvp_send_string(ptr, val, thr->wt_context);

      return true;
}

/*
 * %store/stra <array-label> <index>
 */
bool of_STORE_STRA(vthread_t thr, vvp_code_t cp)
{
      unsigned idx = cp->bit_idx[0];
      unsigned adr = thr->words[idx].w_int;

      string val = thr->pop_str();
      cp->array->set_word(adr, val);

      return true;
}

/*
 * %store/vec4 <var-label>, <offset>, <wid>
 *
 * <offset> is the index register that contains the base offset into
 * the destination. If zero, the offset of 0 is used instead of index
 * register zero. The offset value is SIGNED, and can be negative.
 *
 * <wid> is the actual width, an unsigned number.
 *
 * This function tests flag bit 4. If that flag is set, and <offset>
 * is an actual index register (not zero) then this assumes that the
 * calculation of the <offset> contents failed, and the store is
 * aborted.
 *
 * NOTE: This instruction may loose the <wid> argument because it is
 * not consistent with the %store/vec4/<etc> instructions which have
 * no <wid>.
 */
bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp)
{
      vvp_net_ptr_t ptr(cp->net, 0);
      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      unsigned off_index = cp->bit_idx[0];
      int wid = cp->bit_idx[1];

      int off = off_index? thr->words[off_index].w_int : 0;
      const int sig_value_size = sig->value_size();

      vvp_vector4_t&val = thr->peek_vec4();
      unsigned val_size = val.size();

      if ((int)val_size < wid) {
	    cerr << "XXXX Internal error: val.size()=" << val_size
		 << ", expecting >= " << wid << endl;
      }
      assert((int)val_size >= wid);
      if ((int)val_size > wid) {
	    val.resize(wid);
	    val_size = wid;
      }

	// If there is a problem loading the index register, flags-4
	// will be set to 1, and we know here to skip the actual assignment.
      if (off_index!=0 && thr->flags[4] == BIT4_1) {
	    thr->pop_vec4(1);
	    return true;
      }

      if (off <= -wid) {
	    thr->pop_vec4(1);
	    return true;
      }
      if (off >= sig_value_size) {
	    thr->pop_vec4(1);
	    return true;
      }

	// If the index is below the vector, then only assign the high
	// bits that overlap with the target.
      if (off < 0) {
	    int use_off = -off;
	    wid -= use_off;
	    val = val.subvalue(use_off, wid);
	    val_size = wid;
	    off = 0;
      }

	// If the value is partly above the target, then only assign
	// the bits that overlap.
      if ((off+wid) > sig_value_size) {
	    wid = sig_value_size - off;
	    val = val.subvalue(0, wid);
	    val.resize(wid);
	    val_size = wid;
      }


      if (off==0 && val_size==(unsigned)sig_value_size)
	    vvp_send_vec4(ptr, val, thr->wt_context);
      else
	    vvp_send_vec4_pv(ptr, val, off, wid, sig_value_size, thr->wt_context);

      thr->pop_vec4(1);
      return true;
}

/*
 * %store/vec4a <var-label>, <addr>, <offset>
 */
bool of_STORE_VEC4A(vthread_t thr, vvp_code_t cp)
{
      unsigned adr_index = cp->bit_idx[0];
      unsigned off_index = cp->bit_idx[1];

      vvp_vector4_t&value = thr->peek_vec4();

      long adr = adr_index? thr->words[adr_index].w_int : 0;
      long off = off_index? thr->words[off_index].w_int : 0;

	// Suppress action if flags-4 is true.
      if (thr->flags[4] == BIT4_1) {
	    thr->pop_vec4(1);
	    return true;
      }

      cp->array->set_word(adr, off, value);

      thr->pop_vec4(1);
      return true;
}

/*
 * %sub
 *   pop r;
 *   pop l;
 *   push l-r;
 */
bool of_SUB(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t r = thr->pop_vec4();
      vvp_vector4_t&l = thr->peek_vec4();

      l.sub(r);
      return true;
}

/*
 * %subi <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments, and push
 * the result.
 */
bool of_SUBI(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = cp->number;

      vvp_vector4_t&l = thr->peek_vec4();

	// I expect that most of the bits of an immediate value are
	// going to be zero, so start the result vector with all zero
	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t r (wid, BIT4_0);
      get_immediate_rval (cp, r);

      l.sub(r);

      return true;

}

bool of_SUB_WR(vthread_t thr, vvp_code_t)
{
      double r = thr->pop_real();
      double l = thr->pop_real();
      thr->push_real(l - r);
      return true;
}

/*
 * %substr <first>, <last>
 * Pop a string, take the substring (SystemVerilog style), and return
 * the result to the stack. This opcode actually works by editing the
 * string in place.
 */
bool of_SUBSTR(vthread_t thr, vvp_code_t cp)
{
      int32_t first = thr->words[cp->bit_idx[0]].w_int;
      int32_t last = thr->words[cp->bit_idx[1]].w_int;
      string&val = thr->peek_str(0);

      if (first < 0 || last < first || last >= (int32_t)val.size()) {
	    val = string("");
	    return true;
      }

      val = val.substr(first, last-first+1);
      return true;
}

/*
 * %substr/vec4 <index>, <wid>
 */
bool of_SUBSTR_VEC4(vthread_t thr, vvp_code_t cp)
{
      unsigned sel_idx = cp->bit_idx[0];
      unsigned wid = cp->bit_idx[1];

      int32_t sel = thr->words[sel_idx].w_int;
      string&val = thr->peek_str(0);

      assert(wid%8 == 0);

      if (sel < 0 || sel >= (int32_t)val.size()) {
	    vvp_vector4_t res (wid, BIT4_0);
	    thr->push_vec4(res);
	    return true;
      }

      vvp_vector4_t res (wid, BIT4_0);

      assert(wid==8);
      unsigned char tmp = val[sel];
      for (int idx = 0 ; idx < 8 ; idx += 1) {
	    if (tmp & (1<<idx))
		  res.set_bit(idx, BIT4_1);
      }

      thr->push_vec4(res);
      return true;
}

bool of_FILE_LINE(vthread_t, vvp_code_t cp)
{
      if (show_file_line) {
	    vpiHandle handle = cp->handle;
	    cerr << vpi_get_str(vpiFile, handle) << ":"
	         << vpi_get(vpiLineNo, handle) << ": ";
	    cerr << vpi_get_str(_vpiDescription, handle);
	    cerr << endl;
      }
      return true;
}

/*
 * %test_nul <var-label>;
 * Test if the object at the specified variable is nil. If so, write
 * "1" into flags[4], otherwise write "0" into flags[4].
 */
bool of_TEST_NUL(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      assert(net);
      vvp_fun_signal_object*obj = dynamic_cast<vvp_fun_signal_object*> (net->fun);
      assert(obj);

      if (obj->get_object().test_nil())
	    thr->flags[4] = BIT4_1;
      else
	    thr->flags[4] = BIT4_0;

      return true;
}

bool of_TEST_NUL_A(vthread_t thr, vvp_code_t cp)
{
      unsigned idx = cp->bit_idx[0];
      unsigned adr = thr->words[idx].w_int;
      vvp_object_t word;

	/* If the address is undefined, return true. */
      if (thr->flags[4] == BIT4_1) {
	    return true;
      }

      cp->array->get_word_obj(adr, word);
      if (word.test_nil())
	    thr->flags[4] = BIT4_1;
      else
	    thr->flags[4] = BIT4_0;

      return true;
}

bool of_TEST_NUL_OBJ(vthread_t thr, vvp_code_t)
{
      if (thr->peek_object().test_nil())
	    thr->flags[4] = BIT4_1;
      else
	    thr->flags[4] = BIT4_0;
      return true;
}

/*
 * %test_nul/prop <pid>, <idx>
 */
bool of_TEST_NUL_PROP(vthread_t thr, vvp_code_t cp)
{
      unsigned pid = cp->number;
      unsigned idx = cp->bit_idx[0];

      if (idx != 0) {
	    assert(idx < vthread_s::WORDS_COUNT);
	    idx = thr->words[idx].w_uint;
      }

      vvp_object_t&obj = thr->peek_object();
      vvp_cobject*cobj  = obj.peek<vvp_cobject>();

      vvp_object_t val;
      cobj->get_object(pid, val, idx);

      if (val.test_nil())
	    thr->flags[4] = BIT4_1;
      else
	    thr->flags[4] = BIT4_0;

      return true;
}

bool of_VPI_CALL(vthread_t thr, vvp_code_t cp)
{
      vpip_execute_vpi_call(thr, cp->handle);

      if (schedule_stopped()) {
	    if (! schedule_finished())
		  schedule_vthread(thr, 0, false);

	    return false;
      }

      return schedule_finished()? false : true;
}

/* %wait <label>;
 * Implement the wait by locating the vvp_net_T for the event, and
 * adding this thread to the threads list for the event. The some
 * argument is the  reference to the functor to wait for. This must be
 * an event object of some sort.
 */
bool of_WAIT(vthread_t thr, vvp_code_t cp)
{
      assert(! thr->i_am_in_function);
      assert(! thr->waiting_for_event);
      thr->waiting_for_event = 1;

	/* Add this thread to the list in the event. */
      waitable_hooks_s*ep = dynamic_cast<waitable_hooks_s*> (cp->net->fun);
      assert(ep);
      thr->wait_next = ep->add_waiting_thread(thr);

	/* Return false to suspend this thread. */
      return false;
}

/*
 * Implement the %wait/fork (SystemVerilog) instruction by suspending
 * the current thread until all the detached children have finished.
 */
bool of_WAIT_FORK(vthread_t thr, vvp_code_t)
{
	/* If a %wait/fork is being executed then the parent thread
	 * cannot be waiting in a join or already waiting. */
      assert(! thr->i_am_in_function);
      assert(! thr->i_am_joining);
      assert(! thr->i_am_waiting);

	/* There should be no active children when waiting. */
      assert(thr->children.empty());

	/* If there are no detached children then there is nothing to
	 * wait for. */
      if (thr->detached_children.empty()) return true;

	/* Flag that this process is waiting for the detached children
	 * to finish and suspend it. */
      thr->i_am_waiting = 1;
      return false;
}

/*
 * %xnor
 */
bool of_XNOR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valr = thr->pop_vec4();
      vvp_vector4_t&vall = thr->peek_vec4();
      assert(vall.size() == valr.size());
      unsigned wid = vall.size();

      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {

	    vvp_bit4_t lb = vall.value(idx);
	    vvp_bit4_t rb = valr.value(idx);
	    vall.set_bit(idx, ~(lb ^ rb));
      }

      return true;
}

/*
 * %xor
 */
bool of_XOR(vthread_t thr, vvp_code_t)
{
      vvp_vector4_t valr = thr->pop_vec4();
      vvp_vector4_t&vall = thr->peek_vec4();
      assert(vall.size() == valr.size());
      unsigned wid = vall.size();

      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {

	    vvp_bit4_t lb = vall.value(idx);
	    vvp_bit4_t rb = valr.value(idx);
	    vall.set_bit(idx, lb ^ rb);
      }

      return true;
}


bool of_ZOMBIE(vthread_t thr, vvp_code_t)
{
      thr->pc = codespace_null();
      if ((thr->parent == 0) && (thr->children.empty())) {
	    if (thr->delay_delete)
		  schedule_del_thr(thr);
	    else
		  vthread_delete(thr);
      }
      return false;
}

/*
 * This is a phantom opcode used to call user defined functions. It
 * is used in code generated by the .ufunc statement. It contains a
 * pointer to the executable code of the function and a pointer to
 * a ufunc_core object that has all the port information about the
 * function.
 */
static bool do_exec_ufunc(vthread_t thr, vvp_code_t cp, vthread_t child)
{
      __vpiScope*child_scope = cp->ufunc_core_ptr->func_scope();
      assert(child_scope);

      assert(child_scope->get_type_code() == vpiFunction);
      assert(thr->children.empty());


        /* We can take a number of shortcuts because we know that a
           continuous assignment can only occur in a static scope. */
      assert(thr->wt_context == 0);
      assert(thr->rd_context == 0);

        /* If an automatic function, allocate a context for this call. */
      vvp_context_t child_context = 0;
      if (child_scope->is_automatic()) {
            child_context = vthread_alloc_context(child_scope);
            thr->wt_context = child_context;
            thr->rd_context = child_context;
      }

      child->wt_context = child_context;
      child->rd_context = child_context;

	/* Copy all the inputs to the ufunc object to the port
	   variables of the function. This copies all the values
	   atomically. */
      cp->ufunc_core_ptr->assign_bits_to_ports(child_context);
      child->delay_delete = 1;

      child->parent = thr;
      thr->children.insert(child);
	// This should be the only child
      assert(thr->children.size()==1);

      child->is_scheduled = 1;
      child->i_am_in_function = 1;
      vthread_run(child);
      running_thread = thr;

      assert(test_joinable(thr, child));

      if (child->i_have_ended) {
	    do_join(thr, child);
            return true;
      } else {
	    thr->i_am_joining = 1;
	    return false;
      }
}

bool of_EXEC_UFUNC_REAL(vthread_t thr, vvp_code_t cp)
{
      __vpiScope*child_scope = cp->ufunc_core_ptr->func_scope();
      assert(child_scope);

	/* Create a temporary thread and run it immediately. */
      vthread_t child = vthread_new(cp->cptr, child_scope);
      thr->push_real(0.0);
      child->args_real.push_back(0);

      return do_exec_ufunc(thr, cp, child);
}

bool of_EXEC_UFUNC_VEC4(vthread_t thr, vvp_code_t cp)
{
      __vpiScope*child_scope = cp->ufunc_core_ptr->func_scope();
      assert(child_scope);

      vpiScopeFunction*scope_func = dynamic_cast<vpiScopeFunction*>(child_scope);
      assert(scope_func);

	/* Create a temporary thread and run it immediately. */
      vthread_t child = vthread_new(cp->cptr, child_scope);
      thr->push_vec4(vvp_vector4_t(scope_func->get_func_width()));
      child->args_vec4.push_back(0);

      return do_exec_ufunc(thr, cp, child);
}

/*
 * This is a phantom opcode used to harvest the result of calling a user
 * defined function. It is used in code generated by the .ufunc statement.
 */
bool of_REAP_UFUNC(vthread_t thr, vvp_code_t cp)
{
      __vpiScope*child_scope = cp->ufunc_core_ptr->func_scope();
      assert(child_scope);

	/* Copy the output from the result variable to the output
	   ports of the .ufunc device. */
      cp->ufunc_core_ptr->finish_thread();

        /* If an automatic function, free the context for this call. */
      if (child_scope->is_automatic()) {
            vthread_free_context(thr->rd_context, child_scope);
            thr->wt_context = 0;
            thr->rd_context = 0;
      }

      return true;
}