iverilog/vvp/vthread.cc

/*
 * Copyright (c) 2001-2009 Stephen Williams (steve@icarus.com)
 *
 *    This source code is free software; you can redistribute it
 *    and/or modify it in source code form under the terms of the GNU
 *    General Public License as published by the Free Software
 *    Foundation; either version 2 of the License, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 */

# include  "config.h"
# include  "vthread.h"
# include  "codes.h"
# include  "schedule.h"
# include  "ufunc.h"
# include  "event.h"
# include  "vpi_priv.h"
# include  "vvp_net_sig.h"
#ifdef CHECK_WITH_VALGRIND
# include  "vvp_cleanup.h"
#endif
#ifdef HAVE_MALLOC_H
# include  <malloc.h>
#endif
# include  <typeinfo>
# include  <stdlib.h>
# include  <limits.h>
# include  <string.h>
# include  <math.h>
# include  <assert.h>

# include  <iostream>
#include  <stdio.h>

/* This is the size of an unsigned long in bits. This is just a
   convenience macro. The expression multiplies sizeof by 8, i.e. it
   assumes 8-bit bytes. */
# define CPU_WORD_BITS (8*sizeof(unsigned long))
/* An unsigned long with only its most significant bit set. */
# define TOP_BIT (1UL << (CPU_WORD_BITS-1))

/*
 * This vthread_s structure describes all there is to know about a
 * thread, including its program counter, all the private bits it
 * holds, and its place in other lists.
 *
 *
 * ** Notes On The Interactions of %fork/%join/%end:
 *
 * The %fork instruction creates a new thread and pushes that onto the
 * stack of children for the thread. This new thread, then, becomes
 * the new direct descendant of the thread. This new thread is
 * therefore also the first thread to be reaped when the parent does a
 * %join.
 *
 * It is a programming error for a thread that created threads to not
 * %join as many as it created before it %ends. The linear stack for
 * tracking thread relationships will create a mess otherwise. For
 * example, if A creates B then C, the stack is:
 *
 *       A --> C --> B
 *
 * If C then %forks X, the stack is:
 *
 *       A --> C --> X --> B
 *
 * If C %ends without a join, then the stack is:
 *
 *       A --> C(zombie) --> X --> B
 *
 * If A then executes 2 %joins, it will reap C and X (when it ends)
 * leaving B in purgatory. What's worse, A will block on the schedules
 * of X and C instead of C and B, possibly creating incorrect timing.
 *
 * The schedule_parent_on_end flag is used by threads to tell their
 * children that they are waiting for it to end. It is set by a %join
 * instruction if the child is not already done. The thread that
 * executes a %join instruction sets the flag in its child.
 *
 * The i_have_ended flag, on the other hand, is used by threads to
 * tell their parents that they are already dead. A thread that
 * executes %end will set its own i_have_ended flag and let its parent
 * reap it when the parent does the %join. If a thread has its
 * schedule_parent_on_end flag set already when it %ends, then it
 * reaps itself and simply schedules its parent. If a child has its
 * i_have_ended flag set when a thread executes %join, then it is free
 * to reap the child immediately.
 */

struct vthread_s {
	/* This is the program counter. */
      vvp_code_t pc;
	/* These hold the private thread bits. vthread_new() preloads
	   addresses 0..3 with the constants 0, 1, X and Z. */
      vvp_vector4_t bits4;

	/* These are the word registers. Each register can be viewed
	   as a signed integer, an unsigned integer or a real. */
      union {
	    int64_t  w_int;
	    uint64_t w_uint;
	    double   w_real;
      } words[16];

	/* My parent sets this when it wants me to wake it up. */
      unsigned schedule_parent_on_end :1;
	/* Set by the thread itself once it has executed %end. */
      unsigned i_have_ended      :1;
	/* Asserted and cleared by vthread_schedule_list() when an
	   event wakes the thread. */
      unsigned waiting_for_event :1;
	/* Set by vthread_mark_scheduled(), cleared by vthread_run(). */
      unsigned is_scheduled      :1;
	/* NOTE(review): presumably counts outstanding forked threads;
	   vthread_reap() asserts it is zero before deleting. Confirm
	   against the %fork/%join implementation. */
      unsigned fork_count        :8;
	/* This points to the sole child of the thread. */
      struct vthread_s*child;
	/* This points to my parent, if I have one. */
      struct vthread_s*parent;
	/* This is used for keeping wait queues. */
      struct vthread_s*wait_next;
	/* These are used to keep the thread in a scope. The list is
	   circular and doubly linked, with a stub header cell. */
      struct vthread_s*scope_next, *scope_prev;
	/* These are used to access automatically allocated items. */
      vvp_context_t wt_context, rd_context;
	/* These are used to pass non-blocking event control information. */
      vvp_net_t*event;
      uint64_t ecount;
};

// The thread currently being executed by vthread_run(), or 0 when no
// thread is running. The context accessor functions read this.
struct vthread_s*running_thread = 0;

// this table maps the thread special index bit addresses to
// vvp_bit4_t bit values. Addresses 0..3 of the thread bit space are
// treated as the constants 0, 1, X and Z respectively.
static vvp_bit4_t thr_index_to_bit4[4] = { BIT4_0, BIT4_1, BIT4_X, BIT4_Z };

/*
 * Make sure that addr is a valid index into the thread bit vector,
 * growing the vector to addr+1 bits if it is currently too short.
 */
static inline void thr_check_addr(struct vthread_s*thr, unsigned addr)
{
      const bool too_short = thr->bits4.size() <= addr;
      if (too_short) {
	    thr->bits4.resize(addr+1);
      }
}

/*
 * Read one bit from the thread bit vector. The address must already
 * be inside the vector; this is checked with an assert.
 */
static inline vvp_bit4_t thr_get_bit(struct vthread_s*thr, unsigned addr)
{
      assert(addr < thr->bits4.size());
      vvp_bit4_t bit = thr->bits4.value(addr);
      return bit;
}

/*
 * Write one bit into the thread bit vector, first growing the vector
 * if the address is beyond its current end.
 */
static inline void thr_put_bit(struct vthread_s*thr,
			       unsigned addr, vvp_bit4_t val)
{
      if (thr->bits4.size() <= addr)
	    thr->bits4.resize(addr+1);

      thr->bits4.set_bit(addr, val);
}

// REMOVE ME
static inline void thr_clr_bit_(struct vthread_s*thr, unsigned addr)
{
      thr->bits4.set_bit(addr, BIT4_0);
}

/*
 * Externally visible wrapper around thr_get_bit(): return the thread
 * bit at the given address.
 */
vvp_bit4_t vthread_get_bit(struct vthread_s*thr, unsigned addr)
{
      vvp_bit4_t result = thr_get_bit(thr, addr);
      return result;
}

void vthread_put_bit(struct vthread_s*thr, unsigned addr, vvp_bit4_t bit)
{
      thr_put_bit(thr, addr, bit);
}

/*
 * Return the real value held in word register addr. The register
 * index is not range checked here.
 */
double vthread_get_real(struct vthread_s*thr, unsigned addr)
{
      const double value = thr->words[addr].w_real;
      return value;
}

/*
 * Store a real value into word register addr. The register index is
 * not range checked here.
 */
void vthread_put_real(struct vthread_s*thr, unsigned addr, double val)
{
      double&slot = thr->words[addr].w_real;
      slot = val;
}

/*
 * Return a copy of "that" reduced to exactly "width" bits by keeping
 * the low-index bits. The input must be at least "width" bits wide;
 * widening is not supported and trips an assert.
 */
template <class T> T coerce_to_width(const T&that, unsigned width)
{
      if (that.size() != width) {
	    assert(that.size() > width);

	    T narrowed (width);
	    unsigned pos = 0;
	    while (pos < width) {
		  narrowed.set_bit(pos, that.value(pos));
		  pos += 1;
	    }
	    return narrowed;
      }

      return that;
}

/*
 * Convert wid thread bits starting at addr into an array of unsigned
 * long words.
 *
 * Address 0 yields an all-zero array and address 1 an array of wid
 * one-bits (unused high bits of the last word cleared); both are
 * allocated here with new[]. Addresses 2 and 3 yield a null pointer.
 * Any other address is delegated to the subarray() method of the
 * thread bit vector. Callers in this file release the result with
 * delete[].
 */
static unsigned long* vector_to_array(struct vthread_s*thr,
				      unsigned addr, unsigned wid)
{
      if (addr >= 4)
	    return thr->bits4.subarray(addr, wid);

      if (addr >= 2)
	    return 0;

	/* Only the constant 0 and constant 1 addresses remain. */
      unsigned nwords = (wid + CPU_WORD_BITS - 1) / (CPU_WORD_BITS);
      unsigned long*words = new unsigned long[nwords];

      if (addr == 0) {
	    for (unsigned cur = 0 ;  cur < nwords ;  cur += 1)
		  words[cur] = 0;
	    return words;
      }

      for (unsigned cur = 0 ;  cur < nwords ;  cur += 1)
	    words[cur] = -1UL;

	/* Reduce wid to the number of bits used in the last word,
	   and mask away the bits above them. */
      wid -= (nwords-1) * CPU_WORD_BITS;
      if (wid < CPU_WORD_BITS)
	    words[nwords-1] &= (-1UL) >> (CPU_WORD_BITS-wid);

      return words;
}

/*
 * Build a vector of wid bits from the thread bit space, starting at
 * the addressed location. The special addresses 0..3 stand for the
 * constants 0, 1, X and Z, so for those the result is wid copies of
 * that constant instead of a slice of the thread bits.
 */
static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
					    unsigned bit, unsigned wid)
{
      if (bit < 4)
	    return vvp_vector4_t(wid, thr_index_to_bit4[bit]);

      return vvp_vector4_t(thr->bits4, bit, wid);
}

/*
 * Some of the instructions do wide addition to arrays of long. They
 * use this add_with_carry function to help.
 *
 * Returns a + b + carry truncated to one word, and replaces carry
 * with 1 if the addition wrapped around, or 0 if it did not.
 */
static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
					   unsigned long&carry)
{
      const unsigned long b_and_carry = b + carry;
      const unsigned long total = a + b_and_carry;

	/* Unsigned wrap-around shows up as a result that is smaller
	   than one of the operands of the addition. */
      const bool wrapped = (b_and_carry < b)
			|| (total < b_and_carry)
			|| (total < a);

      carry = wrapped ? 1 : 0;
      return total;
}

/*
 * Multiply two words, producing a double-word product. The low word
 * of the product is returned and the high word is written to carry.
 *
 * Each operand is split into two half-words, and the four partial
 * products are summed half-word by half-word so that no intermediate
 * value overflows an unsigned long.
 */
static unsigned long multiply_with_carry(unsigned long a, unsigned long b,
					 unsigned long&carry)
{
      const unsigned half = CPU_WORD_BITS/2;
      const unsigned long half_mask = (1UL << half) - 1;

      const unsigned long a_lo = a & half_mask;
      const unsigned long a_hi = (a >> half) & half_mask;
      const unsigned long b_lo = b & half_mask;
      const unsigned long b_hi = (b >> half) & half_mask;

	/* The four partial products. */
      const unsigned long p_ll = a_lo * b_lo;
      const unsigned long p_lh = a_lo * b_hi;
      const unsigned long p_hl = a_hi * b_lo;
      const unsigned long p_hh = a_hi * b_hi;

	/* Half-word 0 of the product. */
      const unsigned long q0 = p_ll & half_mask;

	/* Half-word 1, with its overflow carried into half-word 2. */
      unsigned long q1 = ((p_ll >> half) & half_mask)
		       + (p_lh & half_mask)
		       + (p_hl & half_mask);
      unsigned long q2 = (q1 >> half) & half_mask;
      q1 &= half_mask;

	/* Half-word 2, with its overflow carried into half-word 3. */
      q2 += ((p_lh >> half) & half_mask)
	  + ((p_hl >> half) & half_mask)
	  + (p_hh & half_mask);
      unsigned long q3 = (q2 >> half) & half_mask;
      q2 &= half_mask;

	/* Half-word 3. */
      q3 += (p_hh >> half) & half_mask;
      q3 &= half_mask;

      carry = (q3 << half) + q2;
      return (q1 << half) + q0;
}

/*
 * Multiply the multi-word value val (least significant word first)
 * by the single word imm, writing the low "words" words of the
 * product into res. Bits of the product beyond that are discarded.
 */
static void multiply_array_imm(unsigned long*res, unsigned long*val,
			       unsigned words, unsigned long imm)
{
      for (unsigned clr = 0 ; clr < words ; clr += 1)
	    res[clr] = 0;

      for (unsigned src = 0 ; src < words ; src += 1) {
	    unsigned long high;
	    unsigned long low = multiply_with_carry(val[src], imm, high);

	      /* Add the low word at this position, then the high
	         word one position up, and let the carry ripple
	         through the remaining words. */
	    unsigned long carry = 0;
	    res[src] = add_with_carry(res[src], low, carry);

	    unsigned long addend = high;
	    unsigned dst = src+1;
	    while (dst < words) {
		  res[dst] = add_with_carry(res[dst], addend, carry);
		  addend = 0;
		  dst += 1;
	    }
      }
}

/*
 * Get a context for use by a child thread. A context from the free
 * list of the scope is reused when there is one (its items are reset
 * first); otherwise a new context is created and its items are
 * allocated. In both cases the context is pushed onto the front of
 * the list of live contexts of the scope before it is returned.
 */
static vvp_context_t vthread_alloc_context(struct __vpiScope*scope)
{
      assert(scope->is_automatic);

      vvp_context_t context = scope->free_contexts;
      if (context == 0) {
	      /* Nothing to recycle, so build a fresh context. */
	    context = vvp_allocate_context(scope->nitem);
	    for (unsigned item = 0 ; item < scope->nitem ; item += 1)
		  scope->item[item]->alloc_instance(context);
      } else {
	      /* Pop the most recently freed context and reset it. */
	    scope->free_contexts = vvp_get_next_context(context);
	    for (unsigned item = 0 ; item < scope->nitem ; item += 1)
		  scope->item[item]->reset_instance(context);
      }

      vvp_set_next_context(context, scope->live_contexts);
      scope->live_contexts = context;

      return context;
}

/*
 * Free a context previously allocated to a child thread by pushing it
 * onto the freed context stack. Remove it from the list of live contexts
 * in that scope.
 *
 * The context must be non-null and must be on the live list of the
 * scope; both conditions are checked with asserts.
 */
static void vthread_free_context(vvp_context_t context, struct __vpiScope*scope)
{
      assert(scope->is_automatic);
      assert(context);

      if (context == scope->live_contexts) {
            scope->live_contexts = vvp_get_next_context(context);
      } else {
	      /* Find the predecessor of the context in the live list.
	         The cursor is checked BEFORE it is handed to
	         vvp_get_next_context(), so running off the end of
	         the list trips the assert instead of reading through
	         a null context. */
            vvp_context_t prev = scope->live_contexts;
            assert(prev);
            while (context != vvp_get_next_context(prev)) {
                  prev = vvp_get_next_context(prev);
                  assert(prev);
            }
            vvp_set_next_context(prev, vvp_get_next_context(context));
      }

      vvp_set_next_context(context, scope->free_contexts);
      scope->free_contexts = context;
}

#ifdef CHECK_WITH_VALGRIND
/*
 * Release every context on the free list of the scope, including the
 * per-item instance data, and then release the item table of the
 * scope itself. Only the free list is walked here.
 */
void contexts_delete(struct __vpiScope*scope)
{
      for (vvp_context_t cur = scope->free_contexts
		 ; cur ; cur = scope->free_contexts) {
	      /* Unlink the head before its storage goes away. */
	    scope->free_contexts = vvp_get_next_context(cur);

	    for (unsigned item = 0; item < scope->nitem; item += 1)
		  scope->item[item]->free_instance(cur);

	    free(cur);
      }

      free(scope->item);
}
#endif

/*
 * Create a new thread with the given start address, and link it into
 * the list of threads of the given scope. The thread starts with 32
 * thread bits, of which bits 0..3 are loaded with the constants
 * 0, 1, X and Z.
 */
vthread_t vthread_new(vvp_code_t pc, struct __vpiScope*scope)
{
      vthread_t thr = new struct vthread_s;

	/* Execution state. */
      thr->pc     = pc;
      thr->bits4  = vvp_vector4_t(32);
      thr->wt_context = 0;
      thr->rd_context = 0;
      thr->event  = 0;
      thr->ecount = 0;

	/* Family and wait queue links. */
      thr->child  = 0;
      thr->parent = 0;
      thr->wait_next = 0;

	/* Status flags. */
      thr->schedule_parent_on_end = 0;
      thr->i_have_ended = 0;
      thr->waiting_for_event = 0;
      thr->is_scheduled = 0;
      thr->fork_count   = 0;

	/* The threads of a scope are kept in a circular list that
	   starts with a stub header cell. Make the header the first
	   time a thread is created in this scope. */
      vthread_t head = scope->threads;
      if (head == 0) {
	    head = new struct vthread_s;
	    head->pc = codespace_null();
	    head->bits4  = vvp_vector4_t();
	    head->child  = 0;
	    head->parent = 0;
	    head->scope_prev = head;
	    head->scope_next = head;
	    scope->threads = head;
      }

	/* Insert the new thread directly after the header cell. */
      thr->scope_prev = head;
      thr->scope_next = head->scope_next;
      head->scope_next->scope_prev = thr;
      head->scope_next = thr;

	/* Load the constant bits at the special addresses 0..3. */
      for (unsigned idx = 0 ; idx < 4 ; idx += 1)
	    thr_put_bit(thr, idx, thr_index_to_bit4[idx]);

      return thr;
}

#ifdef CHECK_WITH_VALGRIND
#if 0
/*
 * These are not currently correct. If you use them you will get
 * double delete messages. There is still a leak related to a
 * waiting event that needs to be investigated.
 *
 * NOTE(review): besides the double deletes mentioned above,
 * wait_next_delete() reads base->waiting_for_event after base has
 * been advanced to the next pointer, without checking that pointer
 * for null. Fix that before enabling this code.
 */

static void wait_next_delete(vthread_t base)
{
      while (base) {
	    vthread_t tmp = base->wait_next;
	    delete base;
	    base = tmp;
	    if (base->waiting_for_event == 0) break;
      }
}

static void child_delete(vthread_t base)
{
      while (base) {
	    vthread_t tmp = base->child;
	    delete base;
	    base = tmp;
      }
}
#endif

/*
 * Delete all the threads in a scope thread list. The argument is the
 * stub header cell of the circular list; every other cell is deleted
 * first and the header last. A null list is accepted and ignored.
 */
void vthreads_delete(vthread_t base)
{
      if (base == 0) return;

	/* Walk the ring until we are back at the header. */
      vthread_t cur = base->scope_next;
      while (base != cur) {
	    vthread_t tmp = cur->scope_next;
//	    if (cur->waiting_for_event) wait_next_delete(cur->wait_next);
//	    child_delete(cur->child);
	    delete cur;
	    cur = tmp;
      }
	/* This is a stub so does not have a wait_next queue. */
      delete base;
}
#endif

/*
 * Reaping pulls the thread out of the stack of threads and out of the
 * thread list of its scope. If the thread has a child, the child is
 * handed over to the parent of the thread.
 */
static void vthread_reap(vthread_t thr)
{
      vthread_t orphan   = thr->child;
      vthread_t guardian = thr->parent;

	/* Splice the thread out of the parent/child stack. */
      if (orphan) {
	    assert(thr->child->parent == thr);
	    orphan->parent = guardian;
      }
      if (guardian) {
	    assert(thr->parent->child == thr);
	    guardian->child = orphan;
      }
      thr->child = 0;
      thr->parent = 0;

	/* Unlink the thread from the list of its scope. */
      thr->scope_next->scope_prev = thr->scope_prev;
      thr->scope_prev->scope_next = thr->scope_next;

      thr->pc = codespace_null();

	/* A thread that is still scheduled, or still waiting for an
	   event, is left for the scheduled event (which will execute
	   the thread at of_ZOMBIE) to delete. */
      if (thr->is_scheduled || thr->waiting_for_event)
	    return;

	/* Nothing else will touch the thread, so delete it now. */
      assert(thr->fork_count == 0);
      assert(thr->wait_next == 0);
      schedule_del_thr(thr);
}

/*
 * Destroy a thread object. The thread bits are replaced by an empty
 * vector before the object itself is deleted.
 */
void vthread_delete(vthread_t thr)
{
      vvp_vector4_t no_bits;
      thr->bits4 = no_bits;

      delete thr;
}

/*
 * Set the is_scheduled flag of the thread and of every thread linked
 * behind it through wait_next. None of them may be scheduled already.
 */
void vthread_mark_scheduled(vthread_t thr)
{
      for (vthread_t cur = thr ; cur != 0 ; cur = cur->wait_next) {
	    assert(thr->is_scheduled == 0);
	    cur->is_scheduled = 1;
	    thr = cur->wait_next;
      }
}

/*
 * Run the thread, and every thread queued behind it through
 * wait_next. Each thread is detached from the queue, marked as no
 * longer scheduled, and then executed one instruction at a time
 * (fetch, increment the PC, execute) until an opcode asks for the
 * thread to be paused.
 */
void vthread_run(vthread_t thr)
{
      vthread_t next_thr;

      for ( ; thr != 0 ; thr = next_thr) {
	      /* Take the thread off the list before running it. */
	    next_thr = thr->wait_next;
	    thr->wait_next = 0;

	    assert(thr->is_scheduled);
	    thr->is_scheduled = 0;

	    running_thread = thr;

	      /* An opcode implementation returns false when the
	         thread is meant to be paused. The thread object is
	         not touched again after that. */
	    bool keep_going;
	    do {
		  vvp_code_t cp = thr->pc;
		  thr->pc += 1;
		  keep_going = (cp->opcode)(thr, cp);
	    } while (keep_going);
      }

      running_thread = 0;
}

/*
 * The CHUNK_LINK instruction is a special next pointer for linking
 * chunks of code space. It's like a simplified %jmp: the program
 * counter is simply replaced by the link target, which must not be
 * null.
 */
bool of_CHUNK_LINK(vthread_t thr, vvp_code_t code)
{
      assert(code->cptr);

      vvp_code_t next_chunk = code->cptr;
      thr->pc = next_chunk;

      return true;
}

/*
 * This is called by an event functor to wake up all the threads on
 * its list. The list was created by the %wait instruction, so every
 * thread on it must have its waiting_for_event flag set. The flags
 * are cleared, and then the whole list is scheduled with zero delay.
 */
void vthread_schedule_list(vthread_t thr)
{
      vthread_t cur = thr;
      while (cur) {
	    assert(cur->waiting_for_event);
	    cur->waiting_for_event = 0;
	    cur = cur->wait_next;
      }

      schedule_vthread(thr, 0);
}

/*
 * Return the write context of the running thread, or 0 if no thread
 * is running.
 */
vvp_context_t vthread_get_wt_context()
{
      if (running_thread == 0)
	    return 0;

      return running_thread->wt_context;
}

/*
 * Return the read context of the running thread, or 0 if no thread
 * is running.
 */
vvp_context_t vthread_get_rd_context()
{
      if (running_thread == 0)
	    return 0;

      return running_thread->rd_context;
}

/*
 * Return the item at context_idx in the write context of the running
 * thread. There must be a running thread, and it must have a write
 * context.
 */
vvp_context_item_t vthread_get_wt_context_item(unsigned context_idx)
{
      assert(running_thread && running_thread->wt_context);

      vvp_context_t context = running_thread->wt_context;
      return vvp_get_context_item(context, context_idx);
}

/*
 * Return the item at context_idx in the read context of the running
 * thread. There must be a running thread, and it must have a read
 * context.
 */
vvp_context_item_t vthread_get_rd_context_item(unsigned context_idx)
{
      assert(running_thread && running_thread->rd_context);

      vvp_context_t context = running_thread->rd_context;
      return vvp_get_context_item(context, context_idx);
}

/*
 * Store the absolute value of the real in word register bit_idx[1]
 * into word register bit_idx[0].
 */
bool of_ABS_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const double operand = thr->words[cp->bit_idx[1]].w_real;

      thr->words[dst].w_real = fabs(operand);
      return true;
}

/*
 * Allocate a context for the scope named by the instruction, and
 * make it the current write context of the thread. The previous
 * write context is stacked beneath the new one.
 */
bool of_ALLOC(vthread_t thr, vvp_code_t cp)
{
      vvp_context_t fresh = vthread_alloc_context(cp->scope);

        /* Push the new context onto the write context stack. */
      vvp_set_stacked_context(fresh, thr->wt_context);
      thr->wt_context = fresh;

      return true;
}

/*
 * Bitwise AND of two thread vectors, done a whole vector at a time.
 * The result replaces the vector at bit_idx[0].
 */
static bool of_AND_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];
      const unsigned wid = cp->number;

      vvp_vector4_t result = vthread_bits_to_vector(thr, dst, wid);
      result &= vthread_bits_to_vector(thr, src, wid);

      thr->bits4.set_vec(dst, result);
      return true;
}

/*
 * Bitwise AND of two thread vectors, done one bit at a time. The
 * result replaces the vector at bit_idx[0]. A source address below 4
 * names a constant bit, so it is not advanced from bit to bit.
 */
static bool of_AND_narrow(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

	/* Whether the source is a constant cannot change during the
	   loop, so the step can be decided once. */
      const unsigned src_step = (src >= 4) ? 1 : 0;

      for (unsigned remain = cp->number ; remain > 0 ; remain -= 1) {
	    vvp_bit4_t lb = thr_get_bit(thr, dst);
	    vvp_bit4_t rb = thr_get_bit(thr, src);
	    thr_put_bit(thr, dst, lb&rb);

	    dst += 1;
	    src += src_step;
      }

      return true;
}

/*
 * First execution of an AND instruction. Vectors of up to 4 bits use
 * the bit-at-a-time implementation and wider vectors the
 * vector-at-a-time one. The choice is written back into the
 * instruction, so later executions go directly to the selected
 * implementation.
 */
bool of_AND(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      cp->opcode = (cp->number <= 4) ? &of_AND_narrow : &of_AND_wide;

      return cp->opcode(thr, cp);
}


/*
 * Bitwise AND of a thread vector with an immediate value. The
 * immediate comes from bit_idx[1] and is padded with zero bits up to
 * the width of the vector. The result replaces the vector at
 * bit_idx[0].
 */
bool of_ANDI(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      unsigned long imm = cp->bit_idx[1];
      const unsigned wid = cp->number;

      assert(idx1 >= 4);

      vvp_vector4_t result = vthread_bits_to_vector(thr, dst, wid);

	/* Build the immediate operand: all zero, with at most one
	   CPU word of bits copied in from the immediate. */
      vvp_vector4_t imm_vec (wid, BIT4_0);
      const unsigned copy_bits = (wid > CPU_WORD_BITS) ? CPU_WORD_BITS : wid;
      imm_vec.setarray(0, copy_bits, &imm);

      result &= imm_vec;

      thr->bits4.set_vec(dst, result);
      return true;
}

/*
 * Add the thread vector at bit_idx[1] to the one at bit_idx[0],
 * leaving the sum at bit_idx[0]. If either operand cannot be
 * converted to an array of words (vector_to_array() returns 0), the
 * result is a vector of X bits instead.
 */
bool of_ADD(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      unsigned long*lva = vector_to_array(thr, cp->bit_idx[0], cp->number);
      unsigned long*lvb = vector_to_array(thr, cp->bit_idx[1], cp->number);

      if (lva && lvb) {
	    unsigned long carry = 0;
	    for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1)
		  lva[idx] = add_with_carry(lva[idx], lvb[idx], carry);

	      /* We know from the vector_to_array that the address
	         is valid in the thr->bits4 vector, so just write
	         the bits. */
	    thr->bits4.setarray(cp->bit_idx[0], cp->number, lva);

      } else {
	    vvp_vector4_t tmp(cp->number, BIT4_X);
	    thr->bits4.set_vec(cp->bit_idx[0], tmp);
      }

	/* Either array may be null here; delete[] accepts that. */
      delete[]lva;
      delete[]lvb;

      return true;
}

/*
 * Add the real in word register bit_idx[1] to the real in word
 * register bit_idx[0], leaving the sum in register bit_idx[0].
 */
bool of_ADD_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];

      const double sum = thr->words[dst].w_real + thr->words[src].w_real;
      thr->words[dst].w_real = sum;

      return true;
}

/*
 * This is %addi, add-immediate. The first value is a vector, the
 * second value is the immediate value in the bit_idx[1] position.
 * The immediate is padded with zero to the width of the vector. The
 * sum replaces the vector. If the vector cannot be converted to an
 * array of words (vector_to_array() returns 0), the result is a
 * vector of X bits instead.
 */
bool of_ADDI(vthread_t thr, vvp_code_t cp)
{
      const unsigned bit_addr  = cp->bit_idx[0];
      unsigned long addend     = cp->bit_idx[1];
      const unsigned bit_width = cp->number;

      assert(bit_addr >= 4);

      const unsigned word_count = (bit_width+CPU_WORD_BITS-1)/CPU_WORD_BITS;

      unsigned long*lva = vector_to_array(thr, bit_addr, bit_width);

      if (lva) {
	      /* The immediate only contributes to the lowest word;
	         after that just the carry ripples upward. */
	    unsigned long carry = 0;
	    for (unsigned idx = 0 ;  idx < word_count ;  idx += 1) {
		  lva[idx] = add_with_carry(lva[idx], addend, carry);
		  addend = 0;
	    }

	      /* We know from the vector_to_array that the address
	         is valid in the thr->bits4 vector, so just write
	         the bits. */
	    thr->bits4.setarray(bit_addr, bit_width, lva);

      } else {
	    vvp_vector4_t tmp (bit_width, BIT4_X);
	    thr->bits4.set_vec(bit_addr, tmp);
      }

	/* The array may be null here; delete[] accepts that. */
      delete[]lva;

      return true;
}

/* %assign/ar <array>, <delay>, <bit>
 * Generate an assignment event to a real array. Index register 3
 * contains the canonical address of the word in the memory. <delay>
 * is the delay in simulation time. <bit> is the index register
 * containing the real value. Nothing is scheduled when the address
 * is negative.
 */
bool of_ASSIGN_AR(vthread_t thr, vvp_code_t cp)
{
      const long adr = thr->words[3].w_int;
      const unsigned delay = cp->bit_idx[0];
      const double value = thr->words[cp->bit_idx[1]].w_real;

      if (adr < 0)
	    return true;

      schedule_assign_array_word(cp->array, adr, value, delay);
      return true;
}

/* %assign/ar/d <array>, <delay_idx>, <bit>
 * Generate an assignment event to a real array. Index register 3
 * contains the canonical address of the word in the memory.
 * <delay_idx> is the integer register that contains the delay value.
 * <bit> is the index register containing the real value. Nothing is
 * scheduled when the address is negative.
 */
bool of_ASSIGN_ARD(vthread_t thr, vvp_code_t cp)
{
      const long adr = thr->words[3].w_int;
      const vvp_time64_t delay = thr->words[cp->bit_idx[0]].w_int;
      const double value = thr->words[cp->bit_idx[1]].w_real;

      if (adr < 0)
	    return true;

      schedule_assign_array_word(cp->array, adr, value, delay);
      return true;
}

/* %assign/ar/e <array>, <bit>
 * Generate an assignment event to a real array. Index register 3
 * contains the canonical address of the word in the memory. <bit>
 * is the index register containing the real value. The event
 * information is contained in the thread event control registers
 * and is set with %evctl. With an event count of zero the value is
 * assigned with zero delay instead. Nothing is scheduled when the
 * address is negative.
 */
bool of_ASSIGN_ARE(vthread_t thr, vvp_code_t cp)
{
      const long adr = thr->words[3].w_int;
      const double value = thr->words[cp->bit_idx[0]].w_real;

      if (adr < 0)
	    return true;

      if (thr->ecount != 0) {
	    schedule_evctl(cp->array, adr, value, thr->event,
	                   thr->ecount);
      } else {
	    schedule_assign_array_word(cp->array, adr, value, 0);
      }

      return true;
}

/* %assign/av <array>, <delay>, <bit>
 * This generates an assignment event to an array. Index register 0
 * contains the width of the vector being assigned, index register 1
 * contains the offset of the vector within the array word, and index
 * register 3 contains the canonical address of the word in memory.
 *
 * A negative address, or a part that lies entirely outside the array
 * word, assigns nothing. A part that overhangs either end (or both
 * ends) of the word is trimmed to the bits that are inside it.
 */
bool of_ASSIGN_AV(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      long off = thr->words[1].w_int;
      long adr = thr->words[3].w_int;
      unsigned delay = cp->bit_idx[0];
      unsigned bit = cp->bit_idx[1];

      if (adr < 0) return true;

      long vwidth = get_array_word_size(cp->array);
	// We fell off the MSB end.
      if (off >= vwidth) return true;

      if (off < 0 ) {
	      // We fell off the LSB end. This includes the case where
	      // the part ends exactly at the LSB, which would leave
	      // no bits to assign.
	    if ((unsigned)-off >= wid ) return true;
	      // Trim the bits before the LSB. Addresses below 4 name
	      // a constant bit value and must not be advanced.
	    wid += off;
	    if (bit >= 4)
		  bit -= off;
	    off = 0;
      }

	// Trim the bits after the MSB. This is checked after the LSB
	// trim so that a part overhanging both ends is handled.
      if (off + (long)wid > vwidth)
	    wid = vwidth - off;

      assert(wid > 0);

      vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);

      schedule_assign_array_word(cp->array, adr, off, value, delay);
      return true;
}

/* %assign/av/d <array>, <delay_idx>, <bit>
 * This generates an assignment event to an array. Index register 0
 * contains the width of the vector being assigned, index register 1
 * contains the offset of the vector within the array word, and index
 * register 3 contains the canonical address of the word in memory.
 * The named index register contains the delay.
 *
 * A negative address, or a part that lies entirely outside the array
 * word, assigns nothing. A part that overhangs either end (or both
 * ends) of the word is trimmed to the bits that are inside it.
 */
bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      long off = thr->words[1].w_int;
      long adr = thr->words[3].w_int;
      vvp_time64_t delay = thr->words[cp->bit_idx[0]].w_int;
      unsigned bit = cp->bit_idx[1];

      if (adr < 0) return true;

      long vwidth = get_array_word_size(cp->array);
	// We fell off the MSB end.
      if (off >= vwidth) return true;

      if (off < 0 ) {
	      // We fell off the LSB end. This includes the case where
	      // the part ends exactly at the LSB, which would leave
	      // no bits to assign.
	    if ((unsigned)-off >= wid ) return true;
	      // Trim the bits before the LSB. Addresses below 4 name
	      // a constant bit value and must not be advanced.
	    wid += off;
	    if (bit >= 4)
		  bit -= off;
	    off = 0;
      }

	// Trim the bits after the MSB. This is checked after the LSB
	// trim so that a part overhanging both ends is handled.
      if (off + (long)wid > vwidth)
	    wid = vwidth - off;

      assert(wid > 0);

      vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);

      schedule_assign_array_word(cp->array, adr, off, value, delay);
      return true;
}

/*
 * Event controlled assignment to an array word. Index register 0
 * contains the width of the vector being assigned, index register 1
 * contains the offset of the vector within the array word, and index
 * register 3 contains the canonical address of the word in memory.
 * The source bits start at bit_idx[0]. The event and count come from
 * the thread event control registers; with a count of zero the value
 * is assigned with zero delay instead.
 *
 * A negative address, or a part that lies entirely outside the array
 * word, assigns nothing. A part that overhangs either end (or both
 * ends) of the word is trimmed to the bits that are inside it.
 */
bool of_ASSIGN_AVE(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      long off = thr->words[1].w_int;
      long adr = thr->words[3].w_int;
      unsigned bit = cp->bit_idx[0];

      if (adr < 0) return true;

      long vwidth = get_array_word_size(cp->array);
	// We fell off the MSB end.
      if (off >= vwidth) return true;

      if (off < 0 ) {
	      // We fell off the LSB end. This includes the case where
	      // the part ends exactly at the LSB, which would leave
	      // no bits to assign.
	    if ((unsigned)-off >= wid ) return true;
	      // Trim the bits before the LSB. Addresses below 4 name
	      // a constant bit value and must not be advanced.
	    wid += off;
	    if (bit >= 4)
		  bit -= off;
	    off = 0;
      }

	// Trim the bits after the MSB. This is checked after the LSB
	// trim so that a part overhanging both ends is handled.
      if (off + (long)wid > vwidth)
	    wid = vwidth - off;

      assert(wid > 0);

      vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);
	// If the count is zero then just put the value.
      if (thr->ecount == 0) {
	    schedule_assign_array_word(cp->array, adr, off, value, 0);
      } else {
	    schedule_evctl(cp->array, adr, value, off, thr->event, thr->ecount);
      }
      return true;
}

/*
 * This is %assign/v0 <label>, <delay>, <bit>
 * Index register 0 contains a vector width, which must not be zero.
 */
bool of_ASSIGN_V0(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      assert(wid > 0);
      const unsigned delay = cp->bit_idx[0];
      const unsigned bit = cp->bit_idx[1];

      vvp_net_ptr_t ptr (cp->net, 0);

      if (bit < 4) {
	      // A synthetic (constant) vector has to be built first,
	      // and is then plucked from its start.
	    vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);
	    schedule_assign_plucked_vector(ptr, delay, value, 0, wid);
	    return true;
      }

	// If the vector is not a synthetic one, then have the
	// scheduler pluck it directly out of my vector space.
      schedule_assign_plucked_vector(ptr, delay, thr->bits4, bit, wid);
      return true;
}

/*
 * This is %assign/v0/d <label>, <delay_idx>, <bit>
 * Index register 0 contains a vector width, which must not be zero,
 * and the named index register contains the delay.
 */
bool of_ASSIGN_V0D(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      assert(wid > 0);

      const vvp_time64_t delay = thr->words[cp->bit_idx[0]].w_int;
      const unsigned bit = cp->bit_idx[1];

      vvp_net_ptr_t ptr (cp->net, 0);

      if (bit < 4) {
	      // A synthetic (constant) vector has to be built first,
	      // and is then plucked from its start.
	    vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);
	    schedule_assign_plucked_vector(ptr, delay, value, 0, wid);
	    return true;
      }

	// Otherwise the scheduler plucks the bits out of the thread
	// vector space.
      schedule_assign_plucked_vector(ptr, delay, thr->bits4, bit, wid);
      return true;
}

/*
 * This is %assign/v0/e <label>, <bit>
 * Index register 0 contains a vector width, which must not be zero.
 * The thread event control registers supply the event (which must be
 * set) and the count; both are cleared once they have been used.
 */
bool of_ASSIGN_V0E(vthread_t thr, vvp_code_t cp)
{
      assert(thr->event != 0);
      unsigned wid = thr->words[0].w_int;
      assert(wid > 0);
      const unsigned bit = cp->bit_idx[0];

      vvp_net_ptr_t ptr (cp->net, 0);
      vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);

      if (thr->ecount != 0) {
	    schedule_evctl(ptr, value, 0, 0, thr->event, thr->ecount);
      } else {
	      // If the count is zero then just put the value.
	    schedule_assign_plucked_vector(ptr, 0, value, 0, wid);
      }

	// The event control information is used only once.
      thr->event = 0;
      thr->ecount = 0;

      return true;
}

/*
 * This is %assign/v0/x1 <label>, <delay>, <bit>
 * Index register 0 contains a vector part width.
 * Index register 1 contains the offset into the destination vector.
 *
 * A part that starts at or beyond the MSB end of the destination, or
 * that ends at or before its LSB, assigns nothing. A part that
 * overhangs the LSB is trimmed to the bits inside the destination.
 */
bool of_ASSIGN_V0X1(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      long off = thr->words[1].w_int;
      unsigned delay = cp->bit_idx[0];
      unsigned bit = cp->bit_idx[1];

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

	// We fell off the MSB end.
      if (off >= (long)sig->value_size()) return true;
      else if (off < 0 ) {
	      // We fell off the LSB end.
	    if ((unsigned)-off >= wid ) return true;
	      // Trim the bits before the LSB. Addresses below 4 name
	      // a constant bit value and must not be advanced, or
	      // else a different constant (or unrelated thread bits)
	      // would be assigned.
	    wid += off;
	    if (bit >= 4)
		  bit -= off;
	    off = 0;
      }

      assert(wid > 0);

      vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);

      vvp_net_ptr_t ptr (cp->net, 0);
      schedule_assign_vector(ptr, off, sig->value_size(), value, delay);

      return true;
}

/*
 * This is %assign/v0/x1/d <label>, <delayx>, <bit>
 * Index register 0 contains a vector part width.
 * Index register 1 contains the offset into the destination vector.
 * The index register named by <delayx> contains the delay.
 *
 * A part that starts at or beyond the MSB end of the destination, or
 * that ends at or before its LSB, assigns nothing. A part that
 * overhangs the LSB is trimmed to the bits inside the destination.
 */
bool of_ASSIGN_V0X1D(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      long off = thr->words[1].w_int;
      vvp_time64_t delay = thr->words[cp->bit_idx[0]].w_int;
      unsigned bit = cp->bit_idx[1];

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

	// We fell off the MSB end.
      if (off >= (long)sig->value_size()) return true;
      else if (off < 0 ) {
	      // We fell off the LSB end.
	    if ((unsigned)-off >= wid ) return true;
	      // Trim the bits before the LSB. Addresses below 4 name
	      // a constant bit value and must not be advanced, or
	      // else a different constant (or unrelated thread bits)
	      // would be assigned.
	    wid += off;
	    if (bit >= 4)
		  bit -= off;
	    off = 0;
      }

      assert(wid > 0);

      vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);

      vvp_net_ptr_t ptr (cp->net, 0);
      schedule_assign_vector(ptr, off, sig->value_size(), value, delay);

      return true;
}

/*
 * This is %assign/v0/x1/e <label>, <bit>
 * Index register 0 contains a vector part width.
 * Index register 1 contains the offset into the destination vector.
 * The thread event control registers supply the event and the count;
 * they are cleared on every path out of this function.
 *
 * A part that starts at or beyond the MSB end of the destination, or
 * that ends at or before its LSB, assigns nothing. A part that
 * overhangs the LSB is trimmed to the bits inside the destination.
 */
bool of_ASSIGN_V0X1E(vthread_t thr, vvp_code_t cp)
{
      unsigned wid = thr->words[0].w_int;
      long off = thr->words[1].w_int;
      unsigned bit = cp->bit_idx[0];

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

	// We fell off the MSB end.
      if (off >= (long)sig->value_size()) {
	    thr->event = 0;
	    thr->ecount = 0;
	    return true;
      } else if (off < 0 ) {
	      // We fell off the LSB end.
	    if ((unsigned)-off >= wid ) {
		  thr->event = 0;
		  thr->ecount = 0;
		  return true;
	    }
	      // Trim the bits before the LSB. Addresses below 4 name
	      // a constant bit value and must not be advanced, or
	      // else a different constant (or unrelated thread bits)
	      // would be assigned.
	    wid += off;
	    if (bit >= 4)
		  bit -= off;
	    off = 0;
      }

      assert(wid > 0);

      vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);

      vvp_net_ptr_t ptr (cp->net, 0);
	// If the count is zero then just put the value.
      if (thr->ecount == 0) {
	    schedule_assign_vector(ptr, off, sig->value_size(), value, 0);
      } else {
	    schedule_evctl(ptr, value, off, sig->value_size(), thr->event,
	                   thr->ecount);
      }

      thr->event = 0;
      thr->ecount = 0;

      return true;
}

/*
 * This is %assign/wr <vpi-label>, <delay>, <index>
 *
 * This assigns (after a delay) a value to a real variable. The
 * vpi_put_value function does the assign; the immediate delay is
 * converted to a vpiSimTime time structure and passed along with the
 * vpiTransportDelay flag.
 */
bool of_ASSIGN_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned ticks = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];
      struct __vpiHandle*target = cp->handle;

	// Build the time structure for the delay.
      s_vpi_time when;
      when.type = vpiSimTime;
      vpip_time_to_timestruct(&when, ticks);

	// The value comes from the selected real word register.
      t_vpi_value real_val;
      real_val.format = vpiRealVal;
      real_val.value.real = thr->words[src].w_real;

      vpi_put_value(target, &real_val, &when, vpiTransportDelay);
      return true;
}

/*
 * Like of_ASSIGN_WR, but the delay is read from the word register
 * selected by the first operand instead of being an immediate.
 */
bool of_ASSIGN_WRD(vthread_t thr, vvp_code_t cp)
{
      const vvp_time64_t ticks = thr->words[cp->bit_idx[0]].w_int;
      const unsigned src = cp->bit_idx[1];
      struct __vpiHandle*target = cp->handle;

	// Build the time structure for the delay.
      s_vpi_time when;
      when.type = vpiSimTime;
      vpip_time_to_timestruct(&when, ticks);

	// The value comes from the selected real word register.
      t_vpi_value real_val;
      real_val.format = vpiRealVal;
      real_val.value.real = thr->words[src].w_real;

      vpi_put_value(target, &real_val, &when, vpiTransportDelay);
      return true;
}

/*
 * Event controlled assign of a real word to the VPI handle in the
 * instruction. The thread must have an event stored; the stored event
 * and count are cleared before returning.
 */
bool of_ASSIGN_WRE(vthread_t thr, vvp_code_t cp)
{
      assert(thr->event != 0);

      struct __vpiHandle*target = cp->handle;
      const unsigned src = cp->bit_idx[0];

      if (thr->ecount != 0) {
	      // Wait for the event to happen ecount times.
	    schedule_evctl(target, thr->words[src].w_real, thr->event,
	                   thr->ecount);
      } else {
	      // A zero count means the value is put right away.
	    t_vpi_value real_val;

	    real_val.format = vpiRealVal;
	    real_val.value.real = thr->words[src].w_real;
	    vpi_put_value(target, &real_val, 0, vpiNoDelay);
      }

      thr->event = 0;
      thr->ecount = 0;

      return true;
}

/*
 * %assign/x0 is not implemented. Executing it only reports that fact
 * on stderr and lets the thread continue. (An older implementation
 * based on ipoint_index/schedule_assign used to sit here, disabled.)
 */
bool of_ASSIGN_X0(vthread_t, vvp_code_t)
{
      fprintf(stderr, "XXXX forgot how to implement %%assign/x0\n");
      return true;
}

/*
 * Blend two thread vectors bit by bit: wherever the destination bit
 * differs from the source bit, the destination bit becomes X. The
 * destination must be a real vector address (>= 4); a source address
 * below 4 is not advanced as the loop walks the vector.
 */
bool of_BLEND(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

      assert(dst >= 4);

      for (unsigned cnt = 0 ; cnt < cp->number ; cnt += 1, dst += 1) {
	    const vvp_bit4_t dst_bit = thr_get_bit(thr, dst);
	    const vvp_bit4_t src_bit = thr_get_bit(thr, src);

	    if (dst_bit != src_bit)
		  thr_put_bit(thr, dst, BIT4_X);

	    if (src >= 4)
		  src += 1;
      }

      return true;
}

/*
 * Blend two real words: the first word keeps its value only if it
 * compares equal to the second, otherwise it is set to 0.0.
 */
bool of_BLEND_WR(vthread_t thr, vvp_code_t cp)
{
      double&dst = thr->words[cp->bit_idx[0]].w_real;
      const double other = thr->words[cp->bit_idx[1]].w_real;

      if (!(dst == other))
	    dst = 0.0;

      return true;
}

/*
 * The breakpoint opcode does nothing here; the thread just continues
 * with the next instruction.
 */
bool of_BREAKPOINT(vthread_t, vvp_code_t)
{
      return true;
}

/*
 * The %cassign/link instruction connects a source node to a
 * destination node. The destination node must be a signal, because
 * the source is recorded in its cassign_link member so that the link
 * can later be removed without knowing which source this instruction
 * used.
 */
bool of_CASSIGN_LINK(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*const target = cp->net;
      vvp_net_t*const driver = cp->net2;

      vvp_fun_signal_base*sig
	    = dynamic_cast<vvp_fun_signal_base*>(target->fun);
      assert(sig);

      vvp_net_t*const previous = sig->cassign_link;

	/* Nothing to do if this source is already continuously
	   assigned onto the destination. */
      if (previous == driver)
	    return true;

	/* Only one cassign can be active at a time, so unlink any
	   existing one from port[1] first. */
      if (previous != 0) {
	    vvp_net_ptr_t old_ptr (target, 1);
	    previous->unlink(old_ptr);
      }

      sig->cassign_link = driver;

	/* Link the output of the source to port[1] (the cassign
	   port) of the destination. */
      vvp_net_ptr_t new_ptr (target, 1);
      driver->link(new_ptr);

      return true;
}

/*
 * The %cassign/v instruction invokes a continuous assign of a
 * constant value to a signal. The instruction arguments are:
 *
 *     %cassign/v <net>, <base>, <wid> ;
 *
 * The <net> is the net label assembled into a vvp_net pointer, and
 * <base> and <wid> are carried in the bit_idx array.
 *
 * The value is sent as a vvp_vector4_t to port-1 of the target
 * signal.
 */
bool of_CASSIGN_V(vthread_t thr, vvp_code_t cp)
{
      const unsigned src = cp->bit_idx[0];
      const unsigned width = cp->bit_idx[1];

	/* Gather the thread bits into a vector4 value. */
      vvp_vector4_t bits = vthread_bits_to_vector(thr, src, width);

	/* Deliver it to port 1 of the destination net. */
      vvp_net_ptr_t dst_ptr (cp->net, 1);
      vvp_send_vec4(dst_ptr, bits, 0);

      return true;
}

/*
 * Continuous assign of a real word register to a net: the value is
 * sent to port 1 of the destination.
 */
bool of_CASSIGN_WR(vthread_t thr, vvp_code_t cp)
{
      const double real_val = thr->words[cp->bit_idx[0]].w_real;

	/* Deliver the value to port 1 of the destination net. */
      vvp_net_ptr_t dst_ptr (cp->net, 1);
      vvp_send_real(dst_ptr, real_val, 0);

      return true;
}

/*
 * Continuous assign of a part of a signal. The instruction carries
 * the source bit address and the width; the offset of the part in the
 * target vector is taken from index register 0 and may be negative or
 * run past the end of the signal. The part is clipped to the signal
 * and sent to port 1 as a part-select write.
 */
bool of_CASSIGN_X0(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      unsigned base = cp->bit_idx[0];
      unsigned wid = cp->bit_idx[1];

	// Implicitly, we get the base into the target vector from the
	// X0 register.
      long index = thr->words[0].w_int;

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (net->fil);
	// The filter must be a signal value, as in the other opcodes
	// that use this cast; do not dereference a failed cast.
      assert(sig);

	// The whole part is below the LSB of the signal.
      if (index < 0 && (wid <= (unsigned)-index))
	    return true;

	// The whole part is above the MSB of the signal.
      if (index >= (long)sig->value_size())
	    return true;

	// The part starts below the LSB: drop the leading bits. The
	// source address must skip the dropped bits too, so that the
	// bits that land on the signal are the upper bits of the
	// part. Addresses below 4 are not stepped anywhere else in
	// this file, so they are left alone.
      if (index < 0) {
	    if (base >= 4) base += (unsigned) -index;
	    wid -= (unsigned) -index;
	    index = 0;
      }

	// The part runs past the MSB: drop the trailing bits.
      if (index+wid > sig->value_size())
	    wid = sig->value_size() - index;

      vvp_vector4_t vector = vthread_bits_to_vector(thr, base, wid);

      vvp_net_ptr_t ptr (net, 1);
      vvp_send_vec4_pv(ptr, vector, index, wid, sig->value_size(), 0);

      return true;
}

/*
 * Signed compare of two thread vectors. The results go to thread
 * bits 4 (eq), 5 (lt) and 6 (eeq). An operand address below 4 is not
 * advanced while the vector is walked.
 */
bool of_CMPS(vthread_t thr, vvp_code_t cp)
{
      unsigned lptr = cp->bit_idx[0];
      unsigned rptr = cp->bit_idx[1];

	// Address of the most significant (sign) bit of each operand.
      const unsigned lsign_addr = (lptr < 4)? lptr : lptr + cp->number - 1;
      const unsigned rsign_addr = (rptr < 4)? rptr : rptr + cp->number - 1;

	// Check the larger of the two addresses.
      thr_check_addr(thr, (lsign_addr > rsign_addr)? lsign_addr : rsign_addr);

      const vvp_bit4_t lsign = thr_get_bit(thr, lsign_addr);
      const vvp_bit4_t rsign = thr_get_bit(thr, rsign_addr);

      vvp_bit4_t eq  = BIT4_1;
      vvp_bit4_t eeq = BIT4_1;
      vvp_bit4_t lt  = BIT4_0;

	// Walk from the LSB up, so the most significant differing
	// bit has the last word on lt.
      for (unsigned cnt = 0 ; cnt < cp->number ; cnt += 1) {
	    const vvp_bit4_t lv = thr_get_bit(thr, lptr);
	    const vvp_bit4_t rv = thr_get_bit(thr, rptr);

	    if (lv != rv) {
		  eeq = BIT4_0;
		  lt = (lv < rv)? BIT4_1 : BIT4_0;
	    }

	      // Once eq is X it stays X. Otherwise an x/z bit makes
	      // it X, and a 0/1 mismatch makes it 0.
	    if (eq != BIT4_X) {
		  if (bit4_is_xz(lv) || bit4_is_xz(rv))
			eq = BIT4_X;
		  else if (lv != rv)
			eq = BIT4_0;
	    }

	    if (lptr >= 4) lptr += 1;
	    if (rptr >= 4) rptr += 1;
      }

	/* With unknown bits the ordering is unknown too. Otherwise
	   correct lt for the signs: negative < positive for certain,
	   and positive > negative for certain. */
      if (eq == BIT4_X)
	    lt = BIT4_X;
      else if ((lsign == BIT4_1) && (rsign == BIT4_0))
	    lt = BIT4_1;
      else if ((lsign == BIT4_0) && (rsign == BIT4_1))
	    lt = BIT4_0;

      thr_put_bit(thr, 4, eq);
      thr_put_bit(thr, 5, lt);
      thr_put_bit(thr, 6, eeq);

      return true;
}

/*
 * Signed compare of a thread vector against an immediate value. The
 * immediate supplies one bit per position, LSB first. The results go
 * to thread bits 4 (eq), 5 (lt) and 6 (eeq).
 */
bool of_CMPIS(vthread_t thr, vvp_code_t cp)
{
      unsigned lptr = cp->bit_idx[0];
      unsigned imm  = cp->bit_idx[1];

	// Fetch the most significant (sign) bit of the vector operand.
      const unsigned sign_addr = (lptr < 4)? lptr : lptr + cp->number - 1;
      thr_check_addr(thr, sign_addr);
      const vvp_bit4_t lsign = thr_get_bit(thr, sign_addr);

      vvp_bit4_t eq  = BIT4_1;
      vvp_bit4_t eeq = BIT4_1;
      vvp_bit4_t lt  = BIT4_0;

      for (unsigned cnt = 0 ; cnt < cp->number ; cnt += 1) {
	    const vvp_bit4_t lv = thr_get_bit(thr, lptr);
	    const vvp_bit4_t rv = (imm & 1)? BIT4_1 : BIT4_0;
	    imm >>= 1;

	    if (lv != rv) {
		  eeq = BIT4_0;
		  lt = (lv < rv)? BIT4_1 : BIT4_0;
	    }

	      // Once eq is X it stays X. Otherwise an x/z bit makes
	      // it X, and a 0/1 mismatch makes it 0.
	    if (eq != BIT4_X) {
		  if (bit4_is_xz(lv) || bit4_is_xz(rv))
			eq = BIT4_X;
		  else if (lv != rv)
			eq = BIT4_0;
	    }

	    if (lptr >= 4) lptr += 1;
      }

	// Unknown bits make lt unknown; otherwise a set sign bit in
	// the vector forces lt to 1.
      if (eq == BIT4_X)
	    lt = BIT4_X;
      else if (lsign == BIT4_1)
	    lt = BIT4_1;

      thr_put_bit(thr, 4, eq);
      thr_put_bit(thr, 5, lt);
      thr_put_bit(thr, 6, eeq);

      return true;
}

/*
 * The of_CMPIU below punts to this function if there are any xz bits
 * in the vector part of the instruction. In this case we know that
 * there is at least 1 xz bit in the left expression (and there are
 * none in the imm value) so the eeq result must be false. The eq
 * result may be 0 or x, and the lt bit is x.
 */
static bool of_CMPIU_the_hard_way(vthread_t thr, vvp_code_t cp)
{
      unsigned lptr = cp->bit_idx[0];
      unsigned long imm = cp->bit_idx[1];
      const unsigned width = cp->number;

      if (lptr >= 4)
	    thr_check_addr(thr, lptr+width-1);

	// An address below 4 is read once and reused for every
	// position; a vector address is re-read as it advances.
      vvp_bit4_t lv = thr_get_bit(thr, lptr);
      vvp_bit4_t eq = BIT4_1;

      for (unsigned pos = 0 ; pos < width ; pos += 1) {
	    const vvp_bit4_t rv = (imm & 1UL)? BIT4_1 : BIT4_0;
	    imm >>= 1UL;

	    if (bit4_is_xz(lv)) {
		  eq = BIT4_X;
	    } else if (lv != rv) {
		    // A definite mismatch settles eq for good.
		  eq = BIT4_0;
		  break;
	    }

	    if (lptr < 4)
		  continue;

	    lptr += 1;
	    if ((pos+1) < width)
		  lv = thr_get_bit(thr, lptr);
      }

      thr_put_bit(thr, 4, eq);
      thr_put_bit(thr, 5, BIT4_X);
      thr_put_bit(thr, 6, BIT4_0);

      return true;
}

/*
 * Unsigned compare of a thread vector against an immediate value.
 * The results go to thread bits 4 (eq), 5 (lt) and 6 (eeq).
 */
bool of_CMPIU(vthread_t thr, vvp_code_t cp)
{
      const unsigned src = cp->bit_idx[0];
      const unsigned long imm = cp->bit_idx[1];
      const unsigned width = cp->number;

      unsigned long*lwords = vector_to_array(thr, src, width);
	// A null array sends the compare down the hard path, which
	// exists for vectors that contain xz bits. In that case eeq
	// must be false (the immediate value cannot have x or z
	// bits) but the EQ and LT bits have to be worked out bit by
	// bit.
      if (lwords == 0)
	    return of_CMPIU_the_hard_way(thr, cp);

      const unsigned words = (width+CPU_WORD_BITS-1) / CPU_WORD_BITS;
      vvp_bit4_t eq  = BIT4_1;
      vvp_bit4_t lt  = BIT4_0;

	// The immediate only occupies the first word; every higher
	// word is compared against zero. The most significant
	// differing word is seen last and so decides lt.
      unsigned long expect = imm;
      for (unsigned wdx = 0 ; wdx < words ; wdx += 1) {
	    const unsigned long have = lwords[wdx];
	    if (have != expect) {
		  eq = BIT4_0;
		  lt = (have < expect) ? BIT4_1 : BIT4_0;
	    }
	    expect = 0UL;
      }

      delete[]lwords;

	// Without xz bits, eeq is the same as eq.
      thr_put_bit(thr, 4, eq);
      thr_put_bit(thr, 5, lt);
      thr_put_bit(thr, 6, eq);
      return true;
}

/*
 * Bit by bit unsigned compare, used by of_CMPU when one of the
 * operands cannot be converted to an array of words. The lt result is
 * always X here; eq goes to bit 4 and eeq to bit 6.
 */
bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t cp)
{
      unsigned lptr = cp->bit_idx[0];
      unsigned rptr = cp->bit_idx[1];

      vvp_bit4_t eq = BIT4_1;
      vvp_bit4_t eeq = BIT4_1;

      for (unsigned cnt = 0 ;  cnt < cp->number ;  cnt += 1) {
	    const vvp_bit4_t lv = thr_get_bit(thr, lptr);
	    const vvp_bit4_t rv = thr_get_bit(thr, rptr);
	    const bool differ = (lv != rv);

	    if (differ)
		  eeq = BIT4_0;

	    if (bit4_is_xz(lv) || bit4_is_xz(rv)) {
		    // An unknown bit turns a so-far-true eq into X.
		  if (eq == BIT4_1)
			eq = BIT4_X;
	    } else if (differ) {
		    // A definite 0/1 mismatch settles eq.
		  eq = BIT4_0;
	    }

	      // Once eq is 0 nothing later can change the results.
	    if (eq == BIT4_0)
		  break;

	    if (lptr >= 4) lptr += 1;
	    if (rptr >= 4) rptr += 1;
      }

      thr_put_bit(thr, 4, eq);
      thr_put_bit(thr, 5, BIT4_X);
      thr_put_bit(thr, 6, eeq);

      return true;
}

/*
 * Unsigned compare of two thread vectors. When both operands convert
 * to arrays of words the compare is done a word at a time; otherwise
 * it is handed to of_CMPU_the_hard_way. The results go to thread bits
 * 4 (eq), 5 (lt) and 6 (eeq).
 */
bool of_CMPU(vthread_t thr, vvp_code_t cp)
{
      const unsigned width = cp->number;

      unsigned long*lwords = vector_to_array(thr, cp->bit_idx[0], width);
      if (lwords == 0)
	    return of_CMPU_the_hard_way(thr, cp);

      unsigned long*rwords = vector_to_array(thr, cp->bit_idx[1], width);
      if (rwords == 0) {
	    delete[]lwords;
	    return of_CMPU_the_hard_way(thr, cp);
      }

      vvp_bit4_t eq = BIT4_1;
      vvp_bit4_t lt = BIT4_0;

	// Scan down from the most significant word. The first word
	// that differs decides the ordering.
      const unsigned words = (width+CPU_WORD_BITS-1) / CPU_WORD_BITS;
      for (unsigned top = words ; top > 0 ; top -= 1) {
	    const unsigned long lw = lwords[top-1];
	    const unsigned long rw = rwords[top-1];
	    if (lw == rw)
		  continue;

	    eq = BIT4_0;
	    lt = (lw < rw)? BIT4_1 : BIT4_0;
	    break;
      }

      delete[]lwords;
      delete[]rwords;

	// Both operands are fully defined here, so eeq equals eq.
      thr_put_bit(thr, 4, eq);
      thr_put_bit(thr, 5, lt);
      thr_put_bit(thr, 6, eq);

      return true;
}

/*
 * Compare two thread vectors, treating x and z bits on either side
 * as matching anything. Only the eq result (thread bit 4) is written.
 */
bool of_CMPX(vthread_t thr, vvp_code_t cp)
{
      unsigned lptr = cp->bit_idx[0];
      unsigned rptr = cp->bit_idx[1];

      vvp_bit4_t eq = BIT4_1;

      for (unsigned cnt = 0 ;  cnt < cp->number ;  cnt += 1) {
	    const vvp_bit4_t lv = thr_get_bit(thr, lptr);
	    const vvp_bit4_t rv = thr_get_bit(thr, rptr);

	      // Only two defined bits that differ count as a mismatch.
	    const bool wild = bit4_is_xz(lv) || bit4_is_xz(rv);
	    if (!wild && (lv != rv)) {
		  eq = BIT4_0;
		  break;
	    }

	    if (lptr >= 4) lptr += 1;
	    if (rptr >= 4) rptr += 1;
      }

      thr_put_bit(thr, 4, eq);

      return true;
}

/*
 * Compare two real word registers. Thread bit 4 gets the eq result
 * and thread bit 5 the lt result.
 */
bool of_CMPWR(vthread_t thr, vvp_code_t cp)
{
      const double lhs = thr->words[cp->bit_idx[0]].w_real;
      const double rhs = thr->words[cp->bit_idx[1]].w_real;

      thr_put_bit(thr, 4, (lhs == rhs)? BIT4_1 : BIT4_0);
      thr_put_bit(thr, 5, (lhs <  rhs)? BIT4_1 : BIT4_0);

      return true;
}

/*
 * Compare two word registers as signed integers. Thread bit 4 gets
 * the eq result and thread bit 5 the lt result.
 */
bool of_CMPWS(vthread_t thr, vvp_code_t cp)
{
      const int64_t lhs = thr->words[cp->bit_idx[0]].w_int;
      const int64_t rhs = thr->words[cp->bit_idx[1]].w_int;

      thr_put_bit(thr, 4, (lhs == rhs)? BIT4_1 : BIT4_0);
      thr_put_bit(thr, 5, (lhs <  rhs)? BIT4_1 : BIT4_0);

      return true;
}

/*
 * Compare two word registers as unsigned integers. Thread bit 4 gets
 * the eq result and thread bit 5 the lt result.
 */
bool of_CMPWU(vthread_t thr, vvp_code_t cp)
{
      const uint64_t lhs = thr->words[cp->bit_idx[0]].w_uint;
      const uint64_t rhs = thr->words[cp->bit_idx[1]].w_uint;

      thr_put_bit(thr, 4, (lhs == rhs)? BIT4_1 : BIT4_0);
      thr_put_bit(thr, 5, (lhs <  rhs)? BIT4_1 : BIT4_0);

      return true;
}

/*
 * Compare two thread vectors, treating z bits on either side as
 * matching anything. Only the eq result (thread bit 4) is written.
 */
bool of_CMPZ(vthread_t thr, vvp_code_t cp)
{
      unsigned lptr = cp->bit_idx[0];
      unsigned rptr = cp->bit_idx[1];

      vvp_bit4_t eq = BIT4_1;

      for (unsigned cnt = 0 ;  cnt < cp->number ;  cnt += 1) {
	    const vvp_bit4_t lv = thr_get_bit(thr, lptr);
	    const vvp_bit4_t rv = thr_get_bit(thr, rptr);

	      // A z on either side matches; anything else must agree.
	    const bool wild = (lv == BIT4_Z) || (rv == BIT4_Z);
	    if (!wild && (lv != rv)) {
		  eq = BIT4_0;
		  break;
	    }

	    if (lptr >= 4) lptr += 1;
	    if (rptr >= 4) rptr += 1;
      }

      thr_put_bit(thr, 4, eq);

      return true;
}

/*
 * Convert a real word register to an integer word register, rounding
 * with lround(). The first operand selects the destination word and
 * the second the source word.
 */
bool of_CVT_IR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];

      thr->words[dst].w_int = lround(thr->words[src].w_real);

      return true;
}

/*
 * Convert an integer word register to a real word register. The
 * first operand selects the destination word and the second the
 * source word.
 */
bool of_CVT_RI(vthread_t thr, vvp_code_t cp)
{
	// Read the word as int64_t, the way of_CMPWS does. Holding it
	// in a long would drop the upper half on platforms where long
	// is only 32 bits wide.
      int64_t r = thr->words[cp->bit_idx[1]].w_int;
      thr->words[cp->bit_idx[0]].w_real = (double)(r);

      return true;
}

/*
 * Convert a real word register to a vector of thread bits. The
 * vector is built from the real value at the instruction's width and
 * stored at the destination base address.
 */
bool of_CVT_VR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned width = cp->number;
      const double real_val = thr->words[cp->bit_idx[1]].w_real;

      vvp_vector4_t bits(width, real_val);

	/* The thread must have room for the whole new vector. */
      thr_check_addr(thr, dst+width-1);
      thr->bits4.set_vec(dst, bits);

      return true;
}

/*
 * This implements the %deassign instruction. The operands give the
 * net and the base and width of the part to release. Any net that is
 * linked as the continuous-assign source is unlinked first, then all
 * or part of the signal is deassigned. A part that starts past the
 * end of the signal is ignored, and one that runs past the end is
 * clipped.
 */
bool of_DEASSIGN(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      const unsigned base = cp->bit_idx[0];
      unsigned width = cp->bit_idx[1];

      vvp_signal_value*fil = dynamic_cast<vvp_signal_value*> (net->fil);
      assert(fil);
      vvp_fun_signal_vec*sig = dynamic_cast<vvp_fun_signal_vec*>(net->fun);
      assert(sig);

      const unsigned sig_wid = fil->value_size();

	// Clip the part to the signal.
      if (base >= sig_wid)
	    return true;
      if (base+width > sig_wid)
	    width = sig_wid - base;

      const bool whole = (base == 0) && (width == sig_wid);

	// If a net is linked in as the cassign source, unlink it
	// from port 1. That only works for the whole signal.
      vvp_net_t*src = sig->cassign_link;
      if (src) {
	    if (!whole) {
		  fprintf(stderr, "Sorry: when a signal is assigning a "
		          "register, I cannot deassign part of it.\n");
		  exit(1);
	    }
	    vvp_net_ptr_t dst_ptr (net, 1);
	    src->unlink(dst_ptr);
	    sig->cassign_link = 0;
      }

	/* Release all of the net, or only the selected part. */
      if (whole)
	    sig->deassign();
      else
	    sig->deassign_pv(base, width);

      return true;
}

/*
 * Deassign a real valued signal: unlink any net that is linked in as
 * the continuous-assign source, then deassign the signal.
 */
bool of_DEASSIGN_WR(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

      vvp_fun_signal_real*sig = dynamic_cast<vvp_fun_signal_real*>(net->fun);
      assert(sig);

	// If a net is linked in as the cassign source, remove its
	// link to port 1 of this net.
      vvp_net_t*src = sig->cassign_link;
      if (src != 0) {
	    vvp_net_ptr_t dst_ptr (net, 1);
	    src->unlink(dst_ptr);
	    sig->cassign_link = 0;
      }

      sig->deassign();

      return true;
}


/*
 * The delay takes two 32bit numbers to make up a 64bit time.
 *
 *   %delay <low>, <hig>
 *
 * The thread is rescheduled after that time, and false is returned.
 */
bool of_DELAY(vthread_t thr, vvp_code_t cp)
{
      const vvp_time64_t low = cp->bit_idx[0];
      const vvp_time64_t hig = cp->bit_idx[1];

	// <hig> supplies the upper 32 bits, <low> the lower 32.
      const vvp_time64_t when = (hig << 32) + low;

      schedule_vthread(thr, when);
      return false;
}

/*
 * Delay the thread by the time held in one of the first four word
 * registers, selected by the instruction's number field. The thread
 * is rescheduled after that time, and false is returned.
 */
bool of_DELAYX(vthread_t thr, vvp_code_t cp)
{
      assert(cp->number < 4);

	// Keep the delay in vvp_time64_t, as of_ASSIGN_V0X1D does for
	// its register delay. An unsigned long would cut the value
	// down to 32 bits on platforms where long is that narrow.
      vvp_time64_t delay = thr->words[cp->number].w_int;

      schedule_vthread(thr, delay);
      return false;
}

/*
 * Disable the thread thr and, recursively, the children it still has
 * outstanding forks for. Returns true if thr itself, or any child
 * disabled along the way, is the thread "match". of_DISABLE uses this
 * to find out whether the calling thread disabled itself.
 */
static bool do_disable(vthread_t thr, vthread_t match)
{
      bool flag = false;

	/* Pull the target thread out of its scope. */
      thr->scope_next->scope_prev = thr->scope_prev;
      thr->scope_prev->scope_next = thr->scope_next;

	/* Turn the thread off by setting its program counter to
	   the null code address and setting the i_have_ended bit. */
      thr->pc = codespace_null();
      thr->i_have_ended = 1;

	/* Turn off all the children of the thread. Simulate a %join
	   for as many times as needed to clear the results of all the
	   %forks that this thread has done. */
      while (thr->fork_count > 0) {

	    vthread_t tmp = thr->child;
	    assert(tmp);
	    assert(tmp->parent == thr);
	      /* Clear the flag first, so that the recursive call does
		 not schedule this thread or reap the child; the
		 child is reaped below instead. */
	    tmp->schedule_parent_on_end = 0;
	    if (do_disable(tmp, match))
		  flag = true;

	    thr->fork_count -= 1;

	    vthread_reap(tmp);
      }


      if (thr->schedule_parent_on_end) {
	      /* If a parent is waiting in a %join, wake it up. */
	    assert(thr->parent);
	    assert(thr->parent->fork_count > 0);

	    thr->parent->fork_count -= 1;
	    schedule_vthread(thr->parent, 0, true);
	    vthread_reap(thr);

      } else if (thr->parent) {
	      /* If the parent is yet to %join me, let its %join
		 do the reaping. */
	      //assert(tmp->is_scheduled == 0);

      } else {
	      /* No parent at all. Goodbye. */
	    vthread_reap(thr);
      }

	/* thr may have been handed to vthread_reap above, so only
	   its pointer value is compared here; it is not
	   dereferenced. */
      return flag || (thr == match);
}

/*
 * Implement the %disable instruction by scanning the target scope for
 * all the target threads. Kill the target threads and wake up a
 * parent that is attempting a %join. Returns false if the calling
 * thread was among the threads disabled, true otherwise.
 */
bool of_DISABLE(vthread_t thr, vvp_code_t cp)
{
      struct __vpiScope*scope = (struct __vpiScope*)cp->handle;
      struct vthread_s*const head = scope->threads;

	/* A scope without threads has nothing to disable. */
      if (head == 0)
	    return true;

      bool keep_running = true;

	/* do_disable pulls each victim out of the scope list, so
	   keep taking the thread after the head until only the head
	   is left. */
      for (vthread_t victim = head->scope_next
		 ; victim != head ; victim = head->scope_next) {

	      /* If I am disabling myself, remember that fact so
		 that I can finish this statement differently. */
	    if (victim == thr)
		  keep_running = false;

	    if (do_disable(victim, thr))
		  keep_running = false;
      }

      return keep_running;
}

/*
 * This function divides a 2-word number {high, a} by a 1-word
 * number b and returns the 1-word quotient. Assume that high < b,
 * so that the quotient fits in a word. b must not be 0.
 */
static unsigned long divide2words(unsigned long a, unsigned long b,
				  unsigned long high)
{
      unsigned long result = 0;
      while (high > 0) {
	    unsigned long tmp_result = ULONG_MAX / b;
	    unsigned long remain = ULONG_MAX % b;

	      // Step from ULONG_MAX up to 0x1_0...0. If the remainder
	      // reaches b, one more b fits, so the per-word quotient
	      // tmp_result is the value that grows. (It is later
	      // scaled by high; bumping result instead would only
	      // add 1 where high are needed.)
	    remain += 1;
	    if (remain >= b) {
		  remain -= b;
		  tmp_result += 1;
	    }

	      // Now 0x1_0...0 = b*tmp_result + remain
	      // high*0x1_0...0 = high*(b*tmp_result + remain)
	      // high*0x1_0...0 = high*b*tmp_result + high*remain

	      // We know that high*0x1_0...0 >= high*b*tmp_result, and
	      // we know that high*0x1_0...0 > high*remain. Use
	      // high*remain as the remainder for another iteration,
	      // and add tmp_result*high into the current estimate of
	      // the result.
	    result += tmp_result * high;

	      // The new iteration starts with high*remain + a. The
	      // multiply leaves the low word of the product in remain
	      // and the high word in high. Add the low word into a by
	      // hand and carry into high; feeding high to
	      // add_with_carry as the carry-in would add it into the
	      // low word as well.
	    remain = multiply_with_carry(high, remain, high);
	    a += remain;
	    if (a < remain)
		  high += 1;

	      // Now result*b + {high,a} == the input {high,a}. It is
	      // possible that the new high >= 1. If so, it will
	      // certainly be less than high from the previous
	      // iteration. Do another iteration and it will shrink,
	      // eventually to 0.
      }

	// high is now 0, so a is the remaining remainder, so we can
	// finish off the integer divide with a simple a/b.

      return result + a/b;
}

/*
 * Unsigned multi-word divide of ap by bp, both wid bits wide and
 * stored as arrays of CPU words, least significant word first.
 *
 * Returns a new[] allocated array holding the quotient, which the
 * caller must delete[], or 0 if bp is zero. The ap array is modified
 * in place and holds the remainder on return. bp is not modified.
 */
static unsigned long* divide_bits(unsigned long*ap, unsigned long*bp, unsigned wid)
{
	// Do all our work a cpu-word at a time. The "words" variable
	// is the number of words of the wid.
      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;

	// btop is the index of the most significant non-zero word of
	// the divisor (or 0 if the divisor is all zero).
      unsigned btop = words-1;
      while (btop > 0 && bp[btop] == 0)
	    btop -= 1;

	// Detect divide by 0, and exit.
      if (btop==0 && bp[0]==0)
	    return 0;

	// The result array will eventually accumulate the result. The
	// diff array is a difference that we use in the intermediate.
      unsigned long*diff  = new unsigned long[words];
      unsigned long*result= new unsigned long[words];
      for (unsigned idx = 0 ; idx < words ; idx += 1)
	    result[idx] = 0;

	// Work out one word of the quotient per iteration, from the
	// most significant candidate position down to word 0.
      for (unsigned cur = words-btop ; cur > 0 ; cur -= 1) {
	    unsigned cur_ptr = cur-1;
	    unsigned long cur_res;
	    if (ap[cur_ptr+btop] >= bp[btop]) {
		    // Estimate from the top divisor word and the
		    // matching dividend word, plus the word above it
		    // if there is one.
		  unsigned long high = 0;
		  if (cur_ptr+btop+1 < words)
			high = ap[cur_ptr+btop+1];
		  cur_res = divide2words(ap[cur_ptr+btop], bp[btop], high);

	    } else if (cur_ptr+btop+1 >= words) {
		    // The dividend word is smaller and there is no
		    // higher word: this result word stays 0.
		  continue;

	    } else if (ap[cur_ptr+btop+1] == 0) {
		    // Same, when the higher word is zero.
		  continue;

	    } else {
		  cur_res = divide2words(ap[cur_ptr+btop], bp[btop],
					 ap[cur_ptr+btop+1]);
	    }

	      // cur_res is an estimate of the result this far. It
	      // may be too big. (But it will also be >0) Try it,
	      // and if the difference comes out negative, then adjust.

	      // diff = (bp * cur_res)  << cur_ptr;
	    multiply_array_imm(diff+cur_ptr, bp, words-cur_ptr, cur_res);
	      // ap -= diff, computed as ap + ~diff + 1 with the
	      // initial carry supplying the +1.
	    unsigned long carry = 1;
	    for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
		  ap[idx] = add_with_carry(ap[idx], ~diff[idx], carry);

	      // ap has the diff subtracted out of it. If cur_res was
	      // too large, then ap will turn negative. (We easily
	      // tell that ap turned negative by looking at
	      // carry&1. If it is 0, then it is *negative*.) In that
	      // case, we know that cur_res was too large. Correct by
	      // adding b back in and reducing cur_res.
	    if ((carry&1) == 0) {
		    // Keep adding b back in until the remainder
		    // becomes positive again.
		  do {
			cur_res -= 1;
			carry = 0;
			for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
			      ap[idx] = add_with_carry(ap[idx], bp[idx-cur_ptr], carry);
		  } while (carry == 0);
	    }

	    result[cur_ptr] = cur_res;
      }

	// Now ap contains the remainder and result contains the
	// desired result. We should find that:
	//  input-a = bp * result + ap;

      delete[]diff;
      return result;
}

/*
 * Unsigned divide of two thread vectors. The quotient replaces the
 * first operand, which must be a real vector address (>= 4). If
 * either operand cannot be converted to an array of words, or the
 * divisor is zero, the result is all X.
 */
bool of_DIV(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];
      const unsigned width = cp->number;

      assert(dst >= 4);

	// The right operand is only converted if the left one was.
      unsigned long*lval = vector_to_array(thr, dst, width);
      unsigned long*rval = lval ? vector_to_array(thr, src, width) : 0;

      if (lval == 0 || rval == 0) {
	    delete[]lval;
	    vvp_vector4_t all_x(width, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return true;
      }

	// "quot" points at the words to write back, or stays 0 if
	// the divisor turns out to be zero. "long_quot" is the array
	// allocated by divide_bits, if any.
      unsigned long*quot = 0;
      unsigned long*long_quot = 0;

      if (width <= CPU_WORD_BITS) {
	      // The value fits in a single CPU word, so use the
	      // native divide and reuse the lval array.
	    if (rval[0] != 0) {
		  lval[0] /= rval[0];
		  quot = lval;
	    }
      } else {
	      // On return lval holds the remainder and long_quot the
	      // quotient, or long_quot is 0 for a divide by zero.
	    long_quot = divide_bits(lval, rval, width);
	    quot = long_quot;
      }

      if (quot != 0) {
	    thr->bits4.setarray(dst, width, quot);
      } else {
	    vvp_vector4_t all_x(width, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
      }

      delete[]lval;
      delete[]rval;
      delete[]long_quot;
      return true;
}


/*
 * Negate, in place, the multi-word two's complement number stored in
 * val (least significant word first): invert every word and add 1,
 * letting the carry ripple upward.
 */
static void negate_words(unsigned long*val, unsigned words)
{
      unsigned long carry = 1;
      unsigned long*const stop = val + words;

      for (unsigned long*cur = val ; cur != stop ; ++cur)
	    *cur = add_with_carry(0, ~*cur, carry);
}

/*
 * Signed divide of two thread vectors. The quotient replaces the
 * first operand, which must be a real vector address (>= 4). If
 * either operand cannot be converted to an array of words, or the
 * divisor is zero, the result is all X.
 */
bool of_DIV_S(vthread_t thr, vvp_code_t cp)
{
      unsigned adra = cp->bit_idx[0];
      unsigned adrb = cp->bit_idx[1];
      unsigned wid = cp->number;
      unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;

      assert(adra >= 4);

	// Get the values, left and right, in binary form. If there is
	// a problem with either (caused by an X or Z bit) then we
	// know right away that the entire result is X.
      unsigned long*ap = vector_to_array(thr, adra, wid);
      if (ap == 0) {
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->bits4.set_vec(adra, tmp);
	    return true;
      }

      unsigned long*bp = vector_to_array(thr, adrb, wid);
      if (bp == 0) {
	    delete[]ap;
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->bits4.set_vec(adra, tmp);
	    return true;
      }

	// Sign extend the bits in the array to fill out the array.
	// sign_mask covers the unused bits above the vector in the
	// top word; it stays 0 when the vector fills the top word.
      unsigned long sign_mask = 0;
      if (unsigned long sign_bits = (words*CPU_WORD_BITS) - wid) {
	    sign_mask = -1UL << (CPU_WORD_BITS-sign_bits);
	    if (ap[words-1] & (sign_mask>>1))
		  ap[words-1] |= sign_mask;
	    if (bp[words-1] & (sign_mask>>1))
		  bp[words-1] |= sign_mask;
      }

	// If the value fits in a single word, then use the native divide.
      if (wid <= CPU_WORD_BITS) {
	    if (bp[0] == 0) {
		  vvp_vector4_t tmp(wid, BIT4_X);
		  thr->bits4.set_vec(adra, tmp);
	    } else {
		  long tmpa = (long) ap[0];
		  long tmpb = (long) bp[0];
		  unsigned long res;
		    // Dividing by -1 is a negation. Do it with
		    // wrapping unsigned arithmetic, because the
		    // native divide overflows (undefined behavior,
		    // a trap on some CPUs) when the dividend is the
		    // most negative long.
		  if (tmpb == -1)
			res = 0UL - ap[0];
		  else
			res = (unsigned long) (tmpa / tmpb);
		  ap[0] = res & ~sign_mask;
		  thr->bits4.setarray(adra, wid, ap);
	    }
	    delete[]ap;
	    delete[]bp;
	    return true;
      }

	// We need to do the actual division on positive integers.
	// Make them positive here, and remember the negations.
      bool negate_flag = false;
      if ( ((long) ap[words-1]) < 0 ) {
	    negate_flag = true;
	    negate_words(ap, words);
      }
      if ( ((long) bp[words-1]) < 0 ) {
	    negate_flag ^= true;
	    negate_words(bp, words);
      }

	// On return ap holds the remainder and result the quotient,
	// or result is 0 for a divide by zero.
      unsigned long*result = divide_bits(ap, bp, wid);
      if (result == 0) {
	    delete[]ap;
	    delete[]bp;
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->bits4.set_vec(adra, tmp);
	    return true;
      }

	// The quotient is negative if exactly one operand was.
      if (negate_flag) {
	    negate_words(result, words);
      }

	// Clear the sign extension bits above the vector again.
      result[words-1] &= ~sign_mask;

      thr->bits4.setarray(adra, wid, result);
      delete[]ap;
      delete[]bp;
      delete[]result;
      return true;
}

/*
 * Divide two real word registers. The quotient replaces the first
 * operand.
 */
bool of_DIV_WR(vthread_t thr, vvp_code_t cp)
{
      const double numer = thr->words[cp->bit_idx[0]].w_real;
      const double denom = thr->words[cp->bit_idx[1]].w_real;

      thr->words[cp->bit_idx[0]].w_real = numer / denom;

      return true;
}

/*
 * This terminates the current thread. If there is a parent who is
 * waiting for me to die, then I schedule it. At any rate, I mark
 * myself as a zombie by setting my pc to 0.
 *
 * It is possible for this thread to have children at this %end. This
 * means that my child is really my sibling created by my parent, and
 * my parent will do the proper %joins in due course. For example:
 *
 *     %fork child_1, test;
 *     %fork child_2, test;
 *     ... parent code ...
 *     %join;
 *     %join;
 *     %end;
 *
 *   child_1 ;
 *     %end;
 *   child_2 ;
 *     %end;
 *
 * In this example, the main thread creates threads child_1 and
 * child_2. It is possible that this thread is child_2, so there is a
 * parent pointer and a child pointer, even though I did no
 * %forks or %joins. This means that I have a ->child pointer and a
 * ->parent pointer.
 *
 * If the main thread has executed the first %join, then it is waiting
 * for me, and I will be reaped right away.
 *
 * If the main thread has not executed a %join yet, then this thread
 * becomes a zombie. The main thread executes its %join eventually,
 * reaping me at that time.
 *
 * It does not matter the order that child_1 and child_2 threads call
 * %end -- child_2 will be reaped by the first %join, and child_1 will
 * be reaped by the second %join.
 */
bool of_END(vthread_t thr, vvp_code_t)
{
      assert(! thr->waiting_for_event);
      assert( thr->fork_count == 0 );

	/* Mark the thread as ended and park its program counter. */
      thr->i_have_ended = 1;
      thr->pc = codespace_null();

      if (thr->schedule_parent_on_end) {
	      /* My parent is waiting for me in a %join: finish that
		 %join for it, schedule it, and get reaped. */
	    vthread_t waiter = thr->parent;
	    assert(waiter);
	    assert(waiter->fork_count > 0);

	    waiter->fork_count -= 1;
	    schedule_vthread(waiter, 0, true);
	    vthread_reap(thr);

      } else if (thr->parent == 0) {
	      /* With no parent, no one can %join me and there is no
		 reason to stick around. This can happen, for example
		 if I am an ``initial'' thread. Still having children
		 here would mean there were not enough %joins. */
	    assert(thr->child == 0);
	    vthread_reap(thr);
      }

	/* Otherwise I have a parent who may yet %join me, so I stay
	   around as a zombie until it does. In every case this
	   thread stops executing. */
      return false;
}

/*
 * Arm an event control for this thread: remember the event net and
 * take the count from the word register selected by the first
 * operand.
 */
bool of_EVCTL(vthread_t thr, vvp_code_t cp)
{
	/* No event control may already be armed. */
      assert(thr->event == 0 && thr->ecount == 0);

      const unsigned count_reg = cp->bit_idx[0];
      thr->ecount = thr->words[count_reg].w_uint;
      thr->event = cp->net;
      return true;
}
/*
 * Clear the event control state of the thread.
 */
bool of_EVCTLC(vthread_t thr, vvp_code_t)
{
	/* Reset both halves of the event control: count and event. */
      thr->ecount = 0;
      thr->event = 0;
      return true;
}

/*
 * Arm an event control whose count is the immediate value carried
 * in the first operand of the instruction.
 */
bool of_EVCTLI(vthread_t thr, vvp_code_t cp)
{
	/* No event control may already be armed. */
      assert(thr->event == 0 && thr->ecount == 0);

      thr->ecount = cp->bit_idx[0];
      thr->event = cp->net;
      return true;
}

/*
 * Arm an event control whose count is read as a signed value from
 * the selected word register. Negative counts are clamped to zero.
 */
bool of_EVCTLS(vthread_t thr, vvp_code_t cp)
{
	/* No event control may already be armed. */
      assert(thr->event == 0 && thr->ecount == 0);

      thr->event = cp->net;

      const int64_t raw_count = thr->words[cp->bit_idx[0]].w_int;
      int64_t use_count = raw_count;
      if (raw_count < 0)
	    use_count = 0;

      thr->ecount = use_count;
      return true;
}

/*
 * the %force/link instruction connects a source node to a
 * destination node. The destination node must be a signal, as it is
 * marked with the source of the force so that it may later be
 * unlinked without specifically knowing the source that this
 * instruction used.
 */
bool of_FORCE_LINK(vthread_t thr, vvp_code_t cp)
{
	/* The first net operand is the forced destination, the
	   second is the source that drives the force. */
      vvp_net_t*force_src = cp->net2;
      vvp_net_t*force_dst = cp->net;

	/* The link is made through the filter of the destination,
	   so the destination must have one. */
      assert(force_dst->fil);
      force_dst->fil->force_link(force_dst, force_src);

      return true;
}

/*
 * The %force/v instruction invokes a force assign of a constant value
 * to a signal. The instruction arguments are:
 *
 *     %force/v <net>, <base>, <wid> ;
 *
 * where the <net> is the net label assembled into a vvp_net pointer,
 * and the <base> and <wid> are stashed in the bit_idx array.
 *
 * The instruction writes a vvp_vector4_t value to port-2 of the
 * target signal.
 */
bool of_FORCE_V(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*target = cp->net;
      const unsigned src_base = cp->bit_idx[0];
      const unsigned src_wid  = cp->bit_idx[1];

	/* Pull the source bits out of the thread into a vector. */
      vvp_vector4_t force_val = vthread_bits_to_vector(thr, src_base, src_wid);

	/* The force goes through the filter on the node, so there
	   must be one. Make the value as wide as the filter. */
      assert(target->fil);
      if (force_val.size() != target->fil->filter_size())
	    force_val = coerce_to_width(force_val, target->fil->filter_size());

	/* Force every bit: the mask is all ones across the filter. */
      vvp_vector2_t all_bits (vvp_vector2_t::FILL1, target->fil->filter_size());
      target->force_vec4(force_val, all_bits);

      return true;
}

/*
 * Force a real value, taken from the word register selected by the
 * first operand, onto the net.
 */
bool of_FORCE_WR(vthread_t thr, vvp_code_t cp)
{
      const double force_val = thr->words[cp->bit_idx[0]].w_real;

	/* A real value is forced with a 1-bit all-ones mask. */
      vvp_vector2_t mask (vvp_vector2_t::FILL1, 1);
      cp->net->force_real(force_val, mask);

      return true;
}


/*
 * Force a part of a signal. The operands are the target net, the
 * base of the source bits in the thread, and the width of the part.
 * The position of the part within the target is taken from index
 * register 0, and may be negative or run past the end of the target;
 * only the bits that actually overlap the target are forced.
 */
bool of_FORCE_X0(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      unsigned base = cp->bit_idx[0];
      unsigned wid = cp->bit_idx[1];

      assert(net->fil);

	// Implicitly, we get the base into the target vector from the
	// X0 register.
      long index = thr->words[0].w_int;

	// If the entire part is below the beginning of the target,
	// then there is nothing to force.
      if (index < 0 && (wid <= (unsigned)-index))
	    return true;

	// If the part starts below the target, drop the low bits that
	// fall off. The source must skip those same bits so that the
	// remaining source bits still line up with target bit 0.
	// Thread bit addresses below 4 are constants that repeat, so
	// only a real vector address is advanced.
      if (index < 0) {
	    if (base >= 4)
		  base += (unsigned) -index;
	    wid -= (unsigned) -index;
	    index = 0;
      }

      unsigned use_size = net->fil->filter_size();

	// If the part starts beyond the end of the target, then
	// there is nothing to force.
      if (index >= (long)use_size)
	    return true;

	// Trim any bits that run off the top of the target.
      if (index+wid > use_size)
	    wid = use_size - index;

	// The mask selects exactly the target bits being forced.
      vvp_vector2_t mask(vvp_vector2_t::FILL0, use_size);
      for (unsigned idx = 0 ; idx < wid ; idx += 1)
	    mask.set_bit(index+idx, 1);

	// Place the source bits at their position in a full width
	// vector; the bits outside the mask are filler.
      vvp_vector4_t vector = vthread_bits_to_vector(thr, base, wid);
      vvp_vector4_t value(use_size, BIT4_Z);
      value.set_vec(index, vector);

      net->force_vec4(value, mask);

      return true;
}

/*
 * The %fork instruction causes a new child to be created and pushed
 * in front of any existing child. This causes the new child to be the
 * parent of any previous children, and for me to be the parent of the
 * new child.
 */
bool of_FORK(vthread_t thr, vvp_code_t cp)
{
	/* Create the child thread from the code pointer and scope
	   carried by the instruction. */
      vthread_t child = vthread_new(cp->cptr2, cp->scope);
      if (cp->scope->is_automatic) {
              /* The context allocated for this child is the top entry
                 on the write context stack. */
            child->wt_context = thr->wt_context;
            child->rd_context = thr->wt_context;
      }

	/* Push the new child in front of my existing child (if any):
	   the new child adopts the old child, and I adopt the new
	   child. */
      child->child  = thr->child;
      child->parent = thr;
      thr->child = child;
      if (child->child) {
	      /* The previous first child must have pointed back at me;
		 it now points at the new child instead. */
	    assert(child->child->parent == thr);
	    child->child->parent = child;
      }

	/* Count the outstanding fork; %join and %end count it down. */
      thr->fork_count += 1;

	/* If the new child was created to evaluate a function,
	   run it immediately, then return to this thread. */
      if (cp->scope->base.vpi_type->type_code == vpiFunction) {
	    child->is_scheduled = 1;
	    vthread_run(child);
	      /* Make this thread the running thread again now that
		 the child has been run. */
            running_thread = thr;
      } else {
	    schedule_vthread(child, 0, true);
      }

      return true;
}

/*
 * Release the context that is on top of the read context stack of
 * the thread. The scope operand says which scope it belongs to.
 */
bool of_FREE(vthread_t thr, vvp_code_t cp)
{
	/* Unlink the top entry of the read context stack, exposing
	   the context that was stacked beneath it. */
      vvp_context_t popped = thr->rd_context;
      thr->rd_context = vvp_get_stacked_context(popped);

	/* The unlinked context is no longer needed. */
      vthread_free_context(popped, cp->scope);

      return true;
}

/*
 * Invert a vector in place, a whole vector at a time.
 */
static bool of_INV_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned len = cp->bit_idx[1];

	/* Read the vector out, then write its inverse back over it. */
      vvp_vector4_t bits = vthread_bits_to_vector(thr, dst, len);
      thr->bits4.set_vec(dst, ~bits);

      return true;
}

/*
 * Invert a vector in place, one bit at a time.
 */
static bool of_INV_narrow(vthread_t thr, vvp_code_t cp)
{
      const unsigned first = cp->bit_idx[0];
      const unsigned len = cp->bit_idx[1];

      for (unsigned off = 0 ; off < len ; off += 1) {
	    const unsigned adr = first + off;
	    vvp_bit4_t cur = thr_get_bit(thr, adr);
	    thr_put_bit(thr, adr, ~cur);
      }

      return true;
}

/*
 * %inv <bit>, <wid>
 *
 * Invert the <wid> thread bits starting at <bit>, in place. The
 * first time the instruction runs it replaces its own opcode with
 * the narrow (bit at a time) or wide (vector at a time) variant, so
 * later executions go straight to the chosen implementation.
 */
bool of_INV(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

	/* The width is the second bit operand; that is where both
	   variants read it from, so select on that same field. */
      if (cp->bit_idx[1] <= 4)
	    cp->opcode = &of_INV_narrow;
      else
	    cp->opcode = &of_INV_wide;

      return cp->opcode(thr, cp);
}


/*
 * Index registers, arithmetic.
 */

/*
 * Combine two 32 bit halves into a single signed 64 bit value. The
 * low_32 argument supplies bits 0-31 and high_32 supplies bits
 * 32-63, so a high_32 with its top bit set gives a negative result.
 */
static inline int64_t get_as_64_bit(uint32_t low_32, uint32_t high_32)
{
	/* Assemble the value as unsigned. Shifting a signed value so
	   that a bit lands in the sign position is not well defined,
	   whereas the unsigned shift always is. */
      uint64_t bits = ((uint64_t) high_32 << 32) | (uint64_t) low_32;

      return (int64_t) bits;
}

/*
 * Add a 64 bit immediate (split across the two bit operands) to the
 * index register selected by the number operand.
 */
bool of_IX_ADD(vthread_t thr, vvp_code_t cp)
{
      const int64_t imm = get_as_64_bit(cp->bit_idx[0], cp->bit_idx[1]);

      thr->words[cp->number].w_int += imm;
      return true;
}

/*
 * Subtract a 64 bit immediate (split across the two bit operands)
 * from the index register selected by the number operand.
 */
bool of_IX_SUB(vthread_t thr, vvp_code_t cp)
{
      const int64_t imm = get_as_64_bit(cp->bit_idx[0], cp->bit_idx[1]);

      thr->words[cp->number].w_int -= imm;
      return true;
}

/*
 * Multiply the index register selected by the number operand by a
 * 64 bit immediate (split across the two bit operands).
 */
bool of_IX_MUL(vthread_t thr, vvp_code_t cp)
{
      const int64_t imm = get_as_64_bit(cp->bit_idx[0], cp->bit_idx[1]);

      thr->words[cp->number].w_int *= imm;
      return true;
}

/*
 * Load a 64 bit immediate (split across the two bit operands) into
 * the index register selected by the number operand.
 */
bool of_IX_LOAD(vthread_t thr, vvp_code_t cp)
{
      const int64_t imm = get_as_64_bit(cp->bit_idx[0], cp->bit_idx[1]);

      thr->words[cp->number].w_int = imm;
      return true;
}

/*
 * Load a vector into an index register. The format of the
 * opcode is:
 *
 *   %ix/get <ix>, <base>, <wid>
 *
 * where <ix> is the index register, <base> is the base of the
 * vector and <wid> is the width in bits.
 *
 * Index registers only hold binary values, so if any of the
 * bits of the vector are x or z, then set the value to 0,
 * set bit[4] to 1, and give up.
 */
bool of_IX_GET(vthread_t thr, vvp_code_t cp)
{
      const unsigned ix  = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];
      const unsigned wid = cp->number;

	/* A null result means the vector has unknown bits in it. */
      unsigned long*bits = vector_to_array(thr, src, wid);

      if (bits) {
	      /* The value is known. Take the low word and clear
		 thread bit 4. */
	    thr->words[ix].w_int = bits[0];
	    thr_put_bit(thr, 4, BIT4_0);
	    delete[]bits;
      } else {
	      /* The value is unknown. Load 0 instead, and set
		 thread bit 4 to flag the error. */
	    thr->words[ix].w_int = 0;
	    thr_put_bit(thr, 4, BIT4_1);
      }

      return true;
}

/*
 * This is the signed version of of_IX_GET. The vector is sign
 * extended to fill the index register. If any bit of the vector is
 * x or z, the register is loaded with 0 and thread bit 4 is set.
 */
bool of_IX_GET_S(vthread_t thr, vvp_code_t cp)
{
      const unsigned ix  = cp->bit_idx[0];
      unsigned src       = cp->bit_idx[1];
      const unsigned wid = cp->number;

      uint64_t acc = 0;
      bool has_xz = false;

	/* Collect the bits, least significant first. The last bit
	   read is kept because it is the sign bit. */
      vvp_bit4_t last = BIT4_0;
      for (unsigned pos = 0 ;  pos < wid ;  pos += 1) {
	    last = thr_get_bit(thr, src);
	    if (bit4_is_xz(last)) {
		  acc = 0UL;
		  has_xz = true;
		  break;
	    }

	    acc |= (uint64_t) last << pos;

	      /* Addresses below 4 are not advanced, so the same bit
		 is read again for the next position. */
	    if (src >= 4)
		  src += 1;
      }

	/* Sign-extend to fill the integer value: if the sign bit is
	   set, set all the bits from the vector width on up. */
      if (!has_xz && last == BIT4_1 && wid < 8*sizeof(acc))
	    acc |= ~(uint64_t)0 << wid;

      thr->words[ix].w_int = acc;

	/* Set bit 4 as a flag if the input is unknown. */
      thr_put_bit(thr, 4, has_xz? BIT4_1 : BIT4_0);

      return true;
}

/*
 * Load an index register with the unsigned value of a signal. The
 * register is selected by the first bit operand and the signal by
 * the net operand. An unknown value loads 0 and sets thread bit 4.
 */
bool of_IX_GETV(vthread_t thr, vvp_code_t cp)
{
      const unsigned ix = cp->bit_idx[0];
      vvp_net_t*src_net = cp->net;

	/* Only a signal filter can give us a vector value. */
      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*>(src_net->fil);
      if (sig == 0) {
	    assert(src_net->fil);
	    cerr << "%%ix/getv error: Net arg not a vector signal? "
		 << typeid(*src_net->fil).name() << endl;
      }
      assert(sig);

      vvp_vector4_t vec = sig->vec4_value();
      unsigned long val;
      const bool is_known = vector4_to_value(vec, val);

      if (! is_known)
	    thr->words[ix].w_int = 0;
      else
	    thr->words[ix].w_int = val;

	/* Set bit 4 as a flag if the input is unknown. */
      thr_put_bit(thr, 4, is_known? BIT4_0 : BIT4_1);

      return true;
}

/*
 * Load an index register with the signed value of a signal. The
 * register is selected by the first bit operand and the signal by
 * the net operand. An unknown value loads 0 and sets thread bit 4.
 */
bool of_IX_GETVS(vthread_t thr, vvp_code_t cp)
{
      unsigned index = cp->bit_idx[0];
      vvp_net_t*net = cp->net;

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*>(net->fil);
      if (sig == 0) {
	      /* Report both the functor and the filter of the net.
		 Either may be missing, and typeid of a dereferenced
		 null pointer throws, so guard each before naming
		 its type. */
	    cerr << "%%ix/getv/s error: Net arg not a vector signal? "
		 << "fun=" << (net->fun? typeid(*net->fun).name() : "<>")
		 << ", fil=" << (net->fil? typeid(*net->fil).name() : "<>")
		 << endl;
      }
      assert(sig);

      vvp_vector4_t vec = sig->vec4_value();
      long val;
      bool known_flag = vector4_to_value(vec, val, true, true);

      if (known_flag)
	    thr->words[index].w_int = val;
      else
	    thr->words[index].w_int = 0;

	/* Set bit 4 as a flag if the input is unknown. */
      thr_put_bit(thr, 4, known_flag? BIT4_0 : BIT4_1);

      return true;
}

/*
 * The various JMP instructions work simply by pulling the new program
 * counter from the instruction and resuming. If the jump is
 * conditional, then test the bit for the expected value first.
 */
bool of_JMP(vthread_t thr, vvp_code_t cp)
{
	/* The jump is unconditional. */
      thr->pc = cp->cptr;

	/* The usual case is to return true, so that the processor
	   carries on with the next instruction. */
      if (! schedule_stopped())
	    return true;

	/* There was a $stop or vpiStop. Reschedule the thread and
	   return false, which can break the simulation out of a
	   hung loop. */
      schedule_vthread(thr, 0, false);
      return false;
}

bool of_JMP0(vthread_t thr, vvp_code_t cp)
{
	/* Jump only if the tested thread bit is 0. */
      const bool take_jump = thr_get_bit(thr, cp->bit_idx[0]) == 0;
      if (take_jump)
	    thr->pc = cp->cptr;

	/* The usual case is to return true, so that the processor
	   carries on with the next instruction. */
      if (! schedule_stopped())
	    return true;

	/* There was a $stop or vpiStop. Reschedule the thread and
	   return false, which can break the simulation out of a
	   hung loop. */
      schedule_vthread(thr, 0, false);
      return false;
}

bool of_JMP0XZ(vthread_t thr, vvp_code_t cp)
{
	/* Jump if the tested thread bit is anything other than 1. */
      const bool take_jump = thr_get_bit(thr, cp->bit_idx[0]) != BIT4_1;
      if (take_jump)
	    thr->pc = cp->cptr;

	/* The usual case is to return true, so that the processor
	   carries on with the next instruction. */
      if (! schedule_stopped())
	    return true;

	/* There was a $stop or vpiStop. Reschedule the thread and
	   return false, which can break the simulation out of a
	   hung loop. */
      schedule_vthread(thr, 0, false);
      return false;
}

bool of_JMP1(vthread_t thr, vvp_code_t cp)
{
	/* Jump only if the tested thread bit is 1. */
      const bool take_jump = thr_get_bit(thr, cp->bit_idx[0]) == 1;
      if (take_jump)
	    thr->pc = cp->cptr;

	/* The usual case is to return true, so that the processor
	   carries on with the next instruction. */
      if (! schedule_stopped())
	    return true;

	/* There was a $stop or vpiStop. Reschedule the thread and
	   return false, which can break the simulation out of a
	   hung loop. */
      schedule_vthread(thr, 0, false);
      return false;
}

/*
 * The %join instruction causes the thread to wait for the one and
 * only child to die.  If it is already dead (and a zombie) then I
 * reap it and go on. Otherwise, I tell the child that I am ready for
 * it to die, and it will reschedule me when it does.
 */
bool of_JOIN(vthread_t thr, vvp_code_t cp)
{
      assert(thr->child);
      assert(thr->child->parent == thr);

      assert(thr->fork_count > 0);

      vthread_t kid = thr->child;

	/* If the child thread is in an automatic scope, and it is
	   the top level task/function thread, then move the child
	   context from the top of my write context stack to the top
	   of my read context stack. */
      if (kid->wt_context && (thr->wt_context != thr->rd_context)) {
	    vvp_context_t moved = thr->wt_context;

	      /* Pop it from the write context stack... */
	    thr->wt_context = vvp_get_stacked_context(moved);

	      /* ...and push it onto the read context stack. */
	    vvp_set_stacked_context(moved, thr->rd_context);
	    thr->rd_context = moved;
      }

	/* If the child is still running, I get to start waiting.
	   Tell the child to reschedule me when it ends. */
      if (! kid->i_have_ended) {
	    kid->schedule_parent_on_end = 1;
	    return false;
      }

	/* The child has already ended, so reap it now. */
      thr->fork_count -= 1;
      vthread_reap(kid);
      return true;
}

/*
 * %load/ar <bit>, <array-label>, <index>;
*/
bool of_LOAD_AR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst_word = cp->bit_idx[0];
      const unsigned adr_word = cp->bit_idx[1];
      const unsigned adr = thr->words[adr_word].w_int;

	/* Thread bit 4 set means the address is undefined, and in
	   that case the result is 0.0. Otherwise read the array. */
      const bool adr_unknown = thr_get_bit(thr, 4) == BIT4_1;

      double result = 0.0;
      if (! adr_unknown)
	    result = array_get_word_r(cp->array, adr);

      thr->words[dst_word].w_real = result;
      return true;
}

/*
 * %load/av <bit>, <array-label>, <wid> ;
 *
 * <bit> is the thread bit address for the result
 * <array-label> is the array to access, and
 * <wid> is the width of the word to read.
 *
 * The address of the word in the array is in index register 3.
 */
bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst  = cp->bit_idx[0];
      const unsigned want = cp->bit_idx[1];
      const unsigned adr  = thr->words[3].w_int;

	/* Validate the top destination address just once, up front. */
      thr_check_addr(thr, dst+want-1);

	/* Thread bit 4 set means the address is undefined, and in
	   that case the result is 'bx. */
      if (thr_get_bit(thr, 4) == BIT4_1) {
	    vvp_vector4_t all_x (want, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return true;
      }

	/* Fetch the array word, and drop any bits beyond the
	   requested width. */
      vvp_vector4_t got = array_get_word(cp->array, adr);
      if (got.size() > want)
	    got.resize(want);

	/* Write straight into bits4; the address has already been
	   checked, so per-bit checks would be redundant. */
      thr->bits4.set_vec(dst, got);

	/* If the word is shorter than the desired width, then pad
	   the remaining bits with BIT4_X values. */
      for (unsigned pad = got.size() ; pad < want ; pad += 1)
	    thr->bits4.set_bit(dst+pad, BIT4_X);

      return true;
}

/*
 * %load/vp0, %load/vp0/s, %load/avp0 and %load/avp0/s share this function.
*/
static void load_vp0_common(vthread_t thr, vvp_code_t cp, const vvp_vector4_t&sig_value)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->bit_idx[1];
      const int64_t addend = thr->words[0].w_int;

	/* Validate the top destination address just once, up front. */
      thr_check_addr(thr, dst+wid-1);

	/* A null array means that the value could not be converted;
	   the result is then all x. */
      unsigned long*sum = sig_value.subarray(0, wid);
      if (sum == 0) {
	    vvp_vector4_t all_x(wid, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return;
      }

	/* Add the addend to the value, a word at a time. The first
	   word gets the addend itself; the words above that get its
	   sign extension, which is all zeros for a non-negative
	   addend and all ones for a negative one. */
      const unsigned word_count = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
      const unsigned long sign_fill = (addend >= 0)? 0UL : -1UL;
      unsigned long carry = 0;
      unsigned long term = addend;
      for (unsigned w = 0 ; w < word_count ; w += 1) {
	    sum[w] = add_with_carry(sum[w], term, carry);
	    term = sign_fill;
      }

	/* Write straight into bits4; the address has already been
	   checked, so per-bit checks would be redundant. */
      thr->bits4.setarray(dst, wid, sum);
      delete[]sum;
}

/*
 * %load/avp0 <bit>, <array-label>, <wid> ;
 *
 * <bit> is the thread bit address for the result
 * <array-label> is the array to access, and
 * <wid> is the width of the word to read.
 *
 * The address of the word in the array is in index register 3.
 * An integer value from index register 0 is added to the value.
 */
bool of_LOAD_AVP0(vthread_t thr, vvp_code_t cp)
{
      const unsigned res_wid = cp->bit_idx[1];
      const unsigned adr = thr->words[3].w_int;

	/* Thread bit 4 set means the address is undefined, and in
	   that case the result is 'bx. */
      if (thr_get_bit(thr, 4) == BIT4_1) {
	    const unsigned dst = cp->bit_idx[0];
	    thr_check_addr(thr, dst+res_wid-1);
	    vvp_vector4_t all_x (res_wid, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return true;
      }

	/* The math is done at the result width. Start with a vector
	   of that width filled with 0 bits, then copy in the bits of
	   the array word. */
      vvp_vector4_t operand(res_wid, BIT4_0);
      operand.copy_bits(array_get_word(cp->array, adr));

      load_vp0_common(thr, cp, operand);
      return true;
}

/*
 * This is the signed version of of_LOAD_AVP0. The array word is
 * extended to the result width with copies of its top bit.
 */
bool of_LOAD_AVP0_S(vthread_t thr, vvp_code_t cp)
{
      const unsigned res_wid = cp->bit_idx[1];
      const unsigned adr = thr->words[3].w_int;

	/* Thread bit 4 set means the address is undefined, and in
	   that case the result is 'bx. */
      if (thr_get_bit(thr, 4) == BIT4_1) {
	    const unsigned dst = cp->bit_idx[0];
	    thr_check_addr(thr, dst+res_wid-1);
	    vvp_vector4_t all_x (res_wid, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return true;
      }

      vvp_vector4_t word (array_get_word(cp->array, adr));

	/* The math is done at the result width. Start with a vector
	   of that width filled with the top bit of the word, then
	   copy in the bits of the word itself. */
      vvp_vector4_t operand(res_wid, word.value(word.size()-1));
      operand.copy_bits(word);

      load_vp0_common(thr, cp, operand);
      return true;
}

/*
 * %load/avx.p <bit>, <array-label>, <idx> ;
 *
 * <bit> is the thread bit address for the result
 * <array-label> is the array to access, and
 * <idx> is the index register that selects the bit of the word to
 * read. That index register is incremented after the bit is read.
 *
 * The address of the word in the array is in index register 3.
 */
bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned sel_reg = cp->bit_idx[1];
      const unsigned adr = thr->words[3].w_int;

	/* Thread bit 4 set means the address is undefined, and in
	   that case the result is 'bx. Note that the index register
	   is left alone in this case. */
      if (thr_get_bit(thr, 4) == BIT4_1) {
	    thr_put_bit(thr, dst, BIT4_X);
	    return true;
      }

      const long sel = thr->words[sel_reg].w_int;

      vvp_vector4_t word = array_get_word(cp->array, adr);

	/* Selecting a bit outside of the word gives x. */
      const bool in_word = (sel >= 0) && (sel < (long)word.size());
      if (in_word)
	    thr_put_bit(thr, dst, word.value(sel));
      else
	    thr_put_bit(thr, dst, BIT4_X);

	/* Step the index register to the next bit. */
      thr->words[sel_reg].w_int = sel + 1;

      return true;
}

/* %load/v <bit>, <label>, <wid>
 *
 * Implement the %load/v instruction. Load the vector value of the
 * requested width from the <label> functor starting in the thread bit
 * <bit>.
 *
 * The <bit> value is the destination in the thread vector store, and
 * is in cp->bit_idx[0].
 *
 * The <wid> value is the expected width of the vector, and is in
 * cp->bit_idx[1].
 *
 * The functor to read from is the vvp_net_t object pointed to by the
 * cp->net pointer.
 */
/*
 * Return the current vector value of the signal that the net operand
 * of the instruction refers to. This is the part that the %load/v
 * style instructions have in common.
 */
static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;

	/* For the %load to work, the functor must actually be a
	   signal functor. Only signals save their vector value. */
      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (net->fil);
      if (sig == 0) {
	      /* The cast also fails when there is no filter at all,
		 and typeid of a dereferenced null pointer throws.
		 Guard it so that the diagnostic is printed and the
		 assert is reached. */
	    cerr << "%%load/v error: Net arg not a signal? "
		 << (net->fil? typeid(*net->fil).name() : "<>") << endl;
	    assert(sig);
      }

      return sig->vec4_value();
}

bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst  = cp->bit_idx[0];
      const unsigned want = cp->bit_idx[1];

      vvp_vector4_t got = load_base(thr, cp);

	/* Validate the top destination address just once, up front. */
      thr_check_addr(thr, dst+want-1);

	/* Drop any bits beyond the requested width. */
      if (got.size() > want)
	    got.resize(want);

	/* Write straight into bits4; the address has already been
	   checked, so per-bit checks would be redundant. */
      thr->bits4.set_vec(dst, got);

	/* If the signal is shorter than the desired width, then pad
	   the remaining bits with BIT4_X values. */
      for (unsigned pad = got.size() ; pad < want ; pad += 1)
	    thr->bits4.set_bit(dst+pad, BIT4_X);

      return true;
}

/*
 * This is like of_LOAD_VEC, but includes an add of an integer value from
 * index 0. The <wid> is the expected result width not the vector width.
 */

bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
{
      const unsigned res_wid = cp->bit_idx[1];

	/* The math is done at the result width. Start with a vector
	   of that width filled with 0 bits, then copy in the bits of
	   the signal. */
      vvp_vector4_t operand(res_wid, BIT4_0);
      operand.copy_bits(load_base(thr, cp));

      load_vp0_common(thr, cp, operand);
      return true;
}

/*
 * This is the signed version of of_LOAD_VP0. The signal value is
 * extended to the result width with copies of its top bit.
 */
bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t cp)
{
      const unsigned res_wid = cp->bit_idx[1];

      vvp_vector4_t sig_bits (load_base(thr, cp));

	/* The math is done at the result width. Start with a vector
	   of that width filled with the top bit of the signal, then
	   copy in the bits of the signal itself. */
      vvp_vector4_t operand(res_wid, sig_bits.value(sig_bits.size()-1));
      operand.copy_bits(sig_bits);

      load_vp0_common(thr, cp, operand);
      return true;
}

/*
 * Load a real value into the word register selected by the first
 * bit operand. The value is read through the VPI handle operand.
 */
bool of_LOAD_WR(vthread_t thr, vvp_code_t cp)
{
      struct __vpiHandle*src = cp->handle;

	/* Ask for the value of the object in real format. */
      t_vpi_value req;
      req.format = vpiRealVal;
      vpi_get_value(src, &req);

      const unsigned dst_word = cp->bit_idx[0];
      thr->words[dst_word].w_real = req.value.real;

      return true;
}

/*
 * %load/x1p <bit>, <functor>, <wid>
 *
 * <bit> is the destination thread bit and must be >= 4.
 */
bool of_LOAD_X1P(vthread_t thr, vvp_code_t cp)
{
	// The destination thread bit must be a real vector address.
      assert(cp->bit_idx[0] >= 4);
      const unsigned dst = cp->bit_idx[0];
      const int count = cp->bit_idx[1];

	// Index register 1 holds the canonical base address of the
	// part select.
      const long part_base = thr->words[1].w_int;

	// The value is read from the net operand, which must have a
	// signal filter. Only signals save their vector value.
      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
      assert(sig);

	// Copy the part a bit at a time. Bits that are selected from
	// outside of the signal read as x.
      for (long off = 0 ; off < count ; off += 1) {
	    const long src = part_base + off;
	    const bool outside = (src < 0) || (src >= (signed)sig->value_size());

	    vvp_bit4_t bit_val = BIT4_X;
	    if (! outside)
		  bit_val = sig->value(src);

	    thr_put_bit(thr, dst+off, bit_val);
      }

      return true;
}

/*
 * Load an immediate real value into the word register selected by
 * the first bit operand. The number operand is the mantissa, and the
 * second bit operand is the exponent, which also carries the sign
 * and the encodings for the special values.
 */
bool of_LOADI_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst_word = cp->bit_idx[0];
      const int raw_exp = cp->bit_idx[1];

	// An exponent of 0x3fff marks the positive special values:
	// +infinity if the mantissa is zero, and NaN otherwise.
      if (raw_exp == 0x3fff) {
	    if (cp->number == 0)
		  thr->words[dst_word].w_real = INFINITY;
	    else
		  thr->words[dst_word].w_real = nan("");
	    return true;
      }

	// An exponent of 0x7fff with a zero mantissa is -infinity.
      if (raw_exp == 0x7fff && cp->number == 0) {
	    thr->words[dst_word].w_real = -INFINITY;
	    return true;
      }

	// Otherwise this is an ordinary value. Bit 0x4000 of the
	// exponent is the sign, and the low 13 bits are the exponent
	// itself, biased by 0x1000.
      const double sign = (raw_exp & 0x4000)? -1.0 : 1.0;
      const int scale = (raw_exp & 0x1fff) - 0x1000;
      const double mant = cp->number;

      thr->words[dst_word].w_real = sign * ldexp(mant, scale);
      return true;
}

/*
 * Calculate the modulus of vectors that are too wide for the native
 * % operator. The operands are the cp->number bits at thread
 * addresses cp->bit_idx[0] (left) and cp->bit_idx[1] (right), and the
 * result is written over the left operand. The work is done on byte
 * arrays that hold one bit per byte.
 *
 * The left_is_neg and right_is_neg flags say that the respective
 * operand is to be negated as it is read. The result is negated
 * again on the way out if the left operand was negated.
 *
 * If either operand has an x or z bit, or if the right operand is
 * zero, then the result is all x.
 */
static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
			    bool left_is_neg, bool right_is_neg)
{
      bool out_is_neg = left_is_neg;
      int len=cp->number;
	/* a: inverted bits of the right operand (see below).
	   z: bits of the left operand, which becomes the remainder.
	   t: scratch for a trial subtraction.
	   Each has one extra element above the operand bits. */
      unsigned char *a, *z, *t;
      a = new unsigned char[len+1];
      z = new unsigned char[len+1];
      t = new unsigned char[len+1];

      unsigned char carry;
      unsigned char temp;

	/* Index of the top 0 in a, and of the top 1 in z. These stay
	   -1 if there is no such bit. */
      int mxa = -1, mxz = -1;
      int i;
      int current, copylen;

      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];

	/* Negation is done as invert-and-add-1, so the carries for
	   negated operands start out as 1. */
      unsigned lb_carry = left_is_neg? 1 : 0;
      unsigned rb_carry = right_is_neg? 1 : 0;
      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
	    unsigned lb = thr_get_bit(thr, idx1);
	    unsigned rb = thr_get_bit(thr, idx2);

	      /* Bit 2 set in either value means an x or z bit, so
		 the whole result is x. */
	    if ((lb | rb) & 2) {
		  delete []t;
		  delete []z;
		  delete []a;
		  goto x_out;
	    }

	    if (left_is_neg) {
		  lb = (1-lb) + lb_carry;
		  lb_carry = (lb & ~1)? 1 : 0;
		  lb &= 1;
	    }
	    if (right_is_neg) {
		  rb = (1-rb) + rb_carry;
		  rb_carry = (rb & ~1)? 1 : 0;
		  rb &= 1;
	    }

	    z[idx]=lb;
	    a[idx]=1-rb;	// for 2s complement add..

	    idx1 += 1;
	      /* The right address is only advanced if it is >= 4;
		 lower addresses keep reading the same bit. */
	    if (idx2 >= 4)
		  idx2 += 1;
      }

      z[len]=0;
      a[len]=1;

	/* Find the top 0 in a. Since a holds the inverted right
	   operand, this is the top 1 of the right operand. */
      for(i=len-1;i>=0;i--) {
	    if(!a[i]) {
		  mxa=i;
		  break;
	    }
      }

	/* Find the top 1 of the left operand. */
      for(i=len-1;i>=0;i--) {
	    if(z[i]) {
		  mxz=i;
		  break;
	    }
      }

      if((mxa>mxz)||(mxa==-1)) {
	      /* No 1 bits at all in the right operand: this is a
		 modulus by zero, so the result is x. */
	    if(mxa==-1) {
		  delete []t;
		  delete []z;
		  delete []a;
		  goto x_out;
	    }

	      /* The top bit of the right operand is above the top
		 bit of the left, so the left operand is already
		 the remainder. */
	    goto tally;
      }

      copylen = mxa + 2;
      current = mxz - mxa;

	/* Try subtracting the right operand from z at each offset,
	   starting with the highest offset and working down to 0. */
      while(current > -1) {
	      /* Adding the inverted bits with a carry in of 1 is a
		 2s complement subtract. */
	    carry = 1;
	    for(i=0;i<copylen;i++) {
		  temp = z[i+current] + a[i] + carry;
		  t[i] = (temp&1);
		  carry = (temp>>1);
	    }

	      /* Keep the trial result only if there is a carry out. */
	    if(carry) {
		  for(i=0;i<copylen;i++) {
			z[i+current] = t[i];
		  }
	    }

	    current--;
      }

 tally:

	/* Write the remainder over the left operand, negating it
	   (invert and add 1) on the way if the output is negative. */
      carry = out_is_neg? 1 : 0;
      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
	    unsigned ob = z[idx];
	    if (out_is_neg) {
		  ob = (1-ob) + carry;
		  carry = (ob & ~1)? 1 : 0;
		  ob = ob & 1;
	    }
	    thr_put_bit(thr, cp->bit_idx[0]+idx, ob?BIT4_1:BIT4_0);
      }

      delete []t;
      delete []z;
      delete []a;
      return;

	/* The jumps to here have already released the arrays. */
 x_out:
      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);

      return;
}

/*
 * Calculate the unsigned modulus of two vectors. The operands are
 * the cp->number bits at thread addresses cp->bit_idx[0] (left) and
 * cp->bit_idx[1] (right), and the result is written over the left
 * operand. If either operand has an x or z bit, or if the right
 * operand is zero, then the result is all x.
 */
bool of_MOD(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

	/* If the bits fit into an unsigned long long, then use the
	   native % to do the work. */
      if(cp->number <= 8*sizeof(unsigned long long)) {
	    unsigned idx1 = cp->bit_idx[0];
	    unsigned idx2 = cp->bit_idx[1];
	    unsigned long long lv = 0, rv = 0;

	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
		  unsigned long long lb = thr_get_bit(thr, idx1);
		  unsigned long long rb = thr_get_bit(thr, idx2);

		    /* Bit 2 set in either value means an x or z
		       bit, so the whole result is x. */
		  if ((lb | rb) & 2)
			goto x_out;

		  lv |= (unsigned long long) lb << idx;
		  rv |= (unsigned long long) rb << idx;

		  idx1 += 1;
		    /* The right address is only advanced if it is
		       >= 4; lower addresses keep reading the same
		       bit. */
		  if (idx2 >= 4)
			idx2 += 1;
	    }

	      /* The modulus by zero is x. */
	    if (rv == 0)
		  goto x_out;

	    lv %= rv;

	      /* Write the result back, least significant bit first. */
	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)?BIT4_1 : BIT4_0);
		  lv >>= 1;
	    }

	    return true;

      } else {
	      /* Otherwise, use the long form with neither operand
		 negated. */
	    do_verylong_mod(thr, cp, false, false);
	    return true;
      }

 x_out:
      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);

      return true;
}

/*
 * Calculate the signed modulus of two vectors. The operands are the
 * cp->number bits at thread addresses cp->bit_idx[0] (left) and
 * cp->bit_idx[1] (right), and the result is written over the left
 * operand. If either operand has an x or z bit, or if the right
 * operand is zero, then the result is all x.
 */
bool of_MOD_S(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

	/* Handle the case that we can fit the bits into a long-long
	   variable. We can use native % to do the work. */
      if(cp->number <= 8*sizeof(long long)) {
	    unsigned idx1 = cp->bit_idx[0];
	    unsigned idx2 = cp->bit_idx[1];

	      /* Collect the bits as unsigned values, so that putting
		 a bit into the top position is well defined. */
	    unsigned long long lbits = 0, rbits = 0;

	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
		  unsigned long long lb = thr_get_bit(thr, idx1);
		  unsigned long long rb = thr_get_bit(thr, idx2);

		    /* Bit 2 set in either value means an x or z
		       bit, so the whole result is x. */
		  if ((lb | rb) & 2)
			goto x_out;

		  lbits |= lb << idx;
		  rbits |= rb << idx;

		  idx1 += 1;
		    /* The right address is only advanced if it is
		       >= 4; lower addresses keep reading the same
		       bit. */
		  if (idx2 >= 4)
			idx2 += 1;
	    }

	      /* The modulus by zero is x. */
	    if (rbits == 0)
		  goto x_out;

	      /* Sign extend the signed operands. This is only needed
		 (and the shift is only defined) when the operands
		 are narrower than a long long; at full width the
		 sign bit is already in place. */
	    if (cp->number > 0 && cp->number < 8*sizeof(long long)) {
		  const unsigned long long sign_bit = 1ULL << (cp->number-1);
		  const unsigned long long ext_bits = ~0ULL << cp->number;
		  if (lbits & sign_bit)
			lbits |= ext_bits;
		  if (rbits & sign_bit)
			rbits |= ext_bits;
	    }

	    long long lv = (long long) lbits;
	    long long rv = (long long) rbits;

	      /* Anything modulo -1 is 0. Handle that here, because
		 the native % is not defined (and may trap) when the
		 most negative value is divided by -1. */
	    if (rv == -1)
		  lv = 0;
	    else
		  lv %= rv;

	      /* Write the result back, least significant bit first.
		 Shift an unsigned copy, so that the shift does not
		 depend on how negative values are shifted. */
	    unsigned long long out = (unsigned long long) lv;
	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
		  thr_put_bit(thr, cp->bit_idx[0]+idx, (out&1)?BIT4_1:BIT4_0);
		  out >>= 1;
	    }

	    return true;

      } else {

	      /* Otherwise, use the long form. It needs to be told
		 which of the operands are negative, and that is
		 the case if the top bit is 1. */
	    bool left_is_neg
		  = thr_get_bit(thr,cp->bit_idx[0]+cp->number-1) == 1;
	    bool right_is_neg
		  = thr_get_bit(thr,cp->bit_idx[1]+cp->number-1) == 1;
	    do_verylong_mod(thr, cp, left_is_neg, right_is_neg);
	    return true;
      }

 x_out:
      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);

      return true;
}

/*
 * %mod/wr <dest>, <src>
 */
bool of_MOD_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst_word = cp->bit_idx[0];
      const unsigned src_word = cp->bit_idx[1];

	/* The result replaces the left operand. */
      const double lhs = thr->words[dst_word].w_real;
      const double rhs = thr->words[src_word].w_real;
      thr->words[dst_word].w_real = fmod(lhs, rhs);

      return true;
}

/*
 * %mov <dest>, <src>, <wid>
 *   This instruction is implemented by the of_MOV function
 *   below. However, during runtime vvp might notice that the
 *   parameters have certain properties that make it possible to
 *   replace the of_MOV opcode with a more specific instruction that
 *   more directly does the job. All the of_MOV*_ functions are
 *   functions that of_MOV might use to replace itself.
 */

static bool of_MOV1XZ_(vthread_t thr, vvp_code_t cp)
{
	/* This variant handles a source address below 4. The source
	   is then a single bit value, looked up by the address,
	   that fills the whole destination. */
      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->number;

      thr_check_addr(thr, dst+wid-1);

      vvp_vector4_t fill (cp->number, thr_index_to_bit4[cp->bit_idx[1]]);
      thr->bits4.set_vec(dst, fill);
      return true;
}

static bool of_MOV_(vthread_t thr, vvp_code_t cp)
{
	/* This variant handles the general case, where both the
	   source and the destination are vector addresses. */
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];

	/* Check the top address of the destination and then of the
	   source. */
      thr_check_addr(thr, dst+cp->number-1);
      thr_check_addr(thr, src+cp->number-1);

	// Take a copy of the source bits...
      vvp_vector4_t moved (thr->bits4, src, cp->number);
	// ...and put the copy down at the destination.
      thr->bits4.set_vec(dst, moved);

      return true;
}

bool of_MOV(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

	/* Replace this opcode with the variant that suits the
	   source address, then run that variant. A source address
	   below 4 gets the fill variant, and anything else gets the
	   general vector copy. */
      if (cp->bit_idx[1] < 4)
	    cp->opcode = &of_MOV1XZ_;
      else
	    cp->opcode = &of_MOV_;

      return cp->opcode(thr, cp);
}

/*
*  %mov/wr <dst>, <src>
*/
bool of_MOV_WR(vthread_t thr, vvp_code_t cp)
{
	/* Copy the real value from the source word register to the
	   destination word register. */
      const unsigned to_word   = cp->bit_idx[0];
      const unsigned from_word = cp->bit_idx[1];

      const double moved = thr->words[from_word].w_real;
      thr->words[to_word].w_real = moved;
      return true;
}

/*
 * Load an immediate value into a thread vector. The first bit
 * operand is the destination address, the second bit operand is the
 * immediate value, and the number operand is the width. The bits of
 * the vector above the first word are filled with 0.
 */
bool of_MOVI(vthread_t thr, vvp_code_t cp)
{
	/* This buffer holds the bits for one transfer. Only the
	   first word is ever made non-zero. */
      static unsigned long chunk[8] = {0, 0, 0, 0, 0, 0, 0, 0};
      const unsigned chunk_bits = 8*CPU_WORD_BITS;

      unsigned pos = cp->bit_idx[0];
      unsigned remain = cp->number;

      thr_check_addr(thr, pos+remain-1);

      chunk[0] = cp->bit_idx[1];

	/* Write the vector out, one buffer at a time. */
      for ( ; remain > 0 ; ) {
	    const unsigned step = (remain > chunk_bits)? chunk_bits : remain;

	    thr->bits4.setarray(pos, step, chunk);

	      /* The immediate value has been written out, so any
		 further transfers are all zeros. */
	    chunk[0] = 0;
	    remain -= step;
	    pos += step;
      }

      return true;
}

/*
 * Multiply two vectors. The operands are the cp->number bits at
 * thread addresses cp->bit_idx[0] and cp->bit_idx[1], and the result
 * (truncated to that same width) is written over the first operand.
 * If either operand cannot be converted to an array of words, then
 * the result is all x.
 */
bool of_MUL(vthread_t thr, vvp_code_t cp)
{
      unsigned adra = cp->bit_idx[0];
      unsigned adrb = cp->bit_idx[1];
      unsigned wid = cp->number;

      assert(adra >= 4);

	// Get the operands as arrays of words. A null array means
	// that the result is x.
      unsigned long*ap = vector_to_array(thr, adra, wid);
      if (ap == 0) {
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->bits4.set_vec(adra, tmp);
	    return true;
      }

      unsigned long*bp = vector_to_array(thr, adrb, wid);
      if (bp == 0) {
	    delete[]ap;
	    vvp_vector4_t tmp(wid, BIT4_X);
	    thr->bits4.set_vec(adra, tmp);
	    return true;
      }

	// If the value fits in a single CPU word, then do it the easy way.
      if (wid <= CPU_WORD_BITS) {
	    ap[0] *= bp[0];
	    thr->bits4.setarray(adra, wid, ap);
	    delete[]ap;
	    delete[]bp;
	    return true;
      }

	// Otherwise, accumulate the products of the words into a
	// result that starts out as zero.
      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
      unsigned long*res = new unsigned long[words];
      for (unsigned idx = 0 ; idx < words ; idx += 1)
	    res[idx] = 0;

	// Only the pairs of words with mul_a+mul_b < words are
	// multiplied; the others would land above the result.
      for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) {
	    for (unsigned mul_b = 0 ; mul_b < (words-mul_a) ; mul_b += 1) {
		  unsigned long sum;
		  unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum);
		  unsigned base = mul_a + mul_b;
		  unsigned long carry = 0;
		    // Add the value returned by the multiply at word
		    // base. The value it left in sum is added one
		    // word up, and after that only the carry is
		    // rippled through the rest of the words.
		  res[base] = add_with_carry(res[base], tmp, carry);
		  for (unsigned add_idx = base+1; add_idx < words; add_idx += 1) {
			res[add_idx] = add_with_carry(res[add_idx], sum, carry);
			sum = 0;
		  }
	    }
      }

      thr->bits4.setarray(adra, wid, res);
      delete[]ap;
      delete[]bp;
      delete[]res;
      return true;
}

/*
 * Multiply two real word registers. The product of the words selected
 * by cp->bit_idx[0] and cp->bit_idx[1] replaces the former.
 */
bool of_MUL_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];

      const double product = thr->words[dst].w_real * thr->words[src].w_real;
      thr->words[dst].w_real = product;

      return true;
}

/*
 * Multiply the thread vector at cp->bit_idx[0] (cp->number bits wide)
 * by the immediate value in cp->bit_idx[1], writing the product back
 * in place. A null result from vector_to_array() means the vector has
 * X bits, in which case the destination becomes all BIT4_X.
 */
bool of_MULI(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned long factor = cp->bit_idx[1];
      const unsigned wid = cp->number;

      assert(dst >= 4);

      unsigned long*src = vector_to_array(thr, dst, wid);
      if (src == 0) {
	    vvp_vector4_t all_x(wid, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return true;
      }

      if (wid > CPU_WORD_BITS) {
	      // Multi-word operand: let the array helper do the work
	      // into a separate result buffer.
	    const unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
	    unsigned long*prod = new unsigned long[words];

	    multiply_array_imm(prod, src, words, factor);

	    thr->bits4.setarray(dst, wid, prod);
	    delete[]prod;
      } else {
	      // Single word: multiply in place.
	    src[0] *= factor;
	    thr->bits4.setarray(dst, wid, src);
      }

      delete[]src;
      return true;
}

/*
 * Vector-at-a-time NAND: AND the cp->number bit vectors at
 * cp->bit_idx[0] and cp->bit_idx[1], invert, and store the result at
 * cp->bit_idx[0].
 */
static bool of_NAND_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->number;

      vvp_vector4_t acc = vthread_bits_to_vector(thr, dst, wid);
      acc &= vthread_bits_to_vector(thr, cp->bit_idx[1], wid);

      thr->bits4.set_vec(dst, ~acc);
      return true;
}

/*
 * Bit-at-a-time NAND of the vector at cp->bit_idx[1] into the vector
 * at cp->bit_idx[0], cp->number bits wide. A source address below 4
 * is never advanced, so that single bit is reused for every position.
 */
static bool of_NAND_narrow(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

	// Whether the source walks is fixed by its starting address.
      const unsigned src_step = (src >= 4) ? 1 : 0;

      for (unsigned left = cp->number ; left > 0 ; left -= 1) {
	    const vvp_bit4_t lb = thr_get_bit(thr, dst);
	    const vvp_bit4_t rb = thr_get_bit(thr, src);
	    thr_put_bit(thr, dst, ~(lb&rb));

	    dst += 1;
	    src += src_step;
      }

      return true;
}

/*
 * Entry point for NAND. Picks the narrow (bit loop) variant for
 * widths up to 4 and the wide (vector) variant otherwise, stores the
 * choice in cp->opcode, and then runs it.
 */
bool of_NAND(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      cp->opcode = (cp->number <= 4) ? &of_NAND_narrow : &of_NAND_wide;

      return cp->opcode(thr, cp);
}


/*
 * Do nothing and let the thread continue.
 */
bool of_NOOP(vthread_t thr, vvp_code_t cp)
{
	// Neither argument is used.
      (void)thr;
      (void)cp;
      return true;
}

/*
 * Reduction NOR of the cp->number bits starting at cp->bit_idx[1];
 * the single result bit is written to cp->bit_idx[0]. Any 1 bit
 * forces the result to 0; otherwise any bit that is not 0 makes the
 * result X.
 */
bool of_NORR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned src = cp->bit_idx[1];
      vvp_bit4_t result = BIT4_1;

      unsigned pos = 0;
      while (pos < cp->number) {
	    const vvp_bit4_t cur = thr_get_bit(thr, src+pos);
	    pos += 1;

	      // A 1 settles the answer, no need to look further.
	    if (cur == BIT4_1) {
		  result = BIT4_0;
		  break;
	    }

	    if (cur != BIT4_0)
		  result = BIT4_X;
      }

      thr_put_bit(thr, cp->bit_idx[0], result);

      return true;
}

/*
 * Reduction AND of the cp->number bits starting at cp->bit_idx[1];
 * the single result bit is written to cp->bit_idx[0]. Any 0 bit
 * forces the result to 0; otherwise any bit that is not 1 makes the
 * result X.
 */
bool of_ANDR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned src = cp->bit_idx[1];
      const unsigned count = cp->number;
      vvp_bit4_t result = BIT4_1;

      for (unsigned pos = 0 ; pos < count ; pos += 1) {
	    const vvp_bit4_t cur = thr_get_bit(thr, src+pos);

	    if (cur == BIT4_0) {
		    // A 0 settles the answer.
		  result = BIT4_0;
		  break;
	    } else if (cur != BIT4_1) {
		  result = BIT4_X;
	    }
      }

      thr_put_bit(thr, cp->bit_idx[0], result);

      return true;
}

/*
 * Reduction NAND of the cp->number bits starting at cp->bit_idx[1];
 * the single result bit is written to cp->bit_idx[0]. Any 0 bit
 * forces the result to 1; otherwise any bit that is not 1 makes the
 * result X.
 */
bool of_NANDR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned src = cp->bit_idx[1];
      vvp_bit4_t result = BIT4_0;

      unsigned pos = 0;
      while (pos < cp->number) {
	    const vvp_bit4_t cur = thr_get_bit(thr, src+pos);
	    pos += 1;

	      // A 0 settles the answer.
	    if (cur == BIT4_0) {
		  result = BIT4_1;
		  break;
	    }

	    if (cur != BIT4_1)
		  result = BIT4_X;
      }

      thr_put_bit(thr, cp->bit_idx[0], result);

      return true;
}

/*
 * Reduction OR of the cp->number bits starting at cp->bit_idx[1];
 * the single result bit is written to cp->bit_idx[0]. Any 1 bit
 * forces the result to 1; otherwise any bit that is not 0 makes the
 * result X.
 */
bool of_ORR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned src = cp->bit_idx[1];
      const unsigned count = cp->number;
      vvp_bit4_t result = BIT4_0;

      for (unsigned pos = 0 ; pos < count ; pos += 1) {
	    const vvp_bit4_t cur = thr_get_bit(thr, src+pos);

	    if (cur == BIT4_1) {
		    // A 1 settles the answer.
		  result = BIT4_1;
		  break;
	    } else if (cur != BIT4_0) {
		  result = BIT4_X;
	    }
      }

      thr_put_bit(thr, cp->bit_idx[0], result);

      return true;
}

/*
 * Reduction XOR of the cp->number bits starting at cp->bit_idx[1];
 * the single result bit is written to cp->bit_idx[0]. Each 1 bit
 * flips the running result, which starts at 0. The first bit that is
 * neither 0 nor 1 makes the result X and ends the scan.
 */
bool of_XORR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned src = cp->bit_idx[1];
      vvp_bit4_t parity = BIT4_0;

      for (unsigned pos = 0 ; pos < cp->number ; pos += 1) {
	    const vvp_bit4_t cur = thr_get_bit(thr, src+pos);

	    if (cur == BIT4_0)
		  continue;

	    if (cur != BIT4_1) {
		  parity = BIT4_X;
		  break;
	    }

	    parity = ~parity;
      }

      thr_put_bit(thr, cp->bit_idx[0], parity);

      return true;
}

/*
 * Reduction XNOR of the cp->number bits starting at cp->bit_idx[1];
 * the single result bit is written to cp->bit_idx[0]. Same scan as
 * the reduction XOR, except that the running result starts at 1.
 */
bool of_XNORR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned src = cp->bit_idx[1];
      vvp_bit4_t parity = BIT4_1;

      for (unsigned pos = 0 ; pos < cp->number ; pos += 1) {
	    const vvp_bit4_t cur = thr_get_bit(thr, src+pos);

	    if (cur == BIT4_0)
		  continue;

	      // Anything other than 0 or 1 poisons the result.
	    if (cur != BIT4_1) {
		  parity = BIT4_X;
		  break;
	    }

	    parity = ~parity;
      }

      thr_put_bit(thr, cp->bit_idx[0], parity);

      return true;
}

/*
 * Vector-at-a-time OR: OR the cp->number bit vector at cp->bit_idx[1]
 * into the vector at cp->bit_idx[0].
 */
static bool of_OR_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->number;

      vvp_vector4_t acc = vthread_bits_to_vector(thr, dst, wid);
      acc |= vthread_bits_to_vector(thr, cp->bit_idx[1], wid);

      thr->bits4.set_vec(dst, acc);
      return true;
}

/*
 * Bit-at-a-time OR of the vector at cp->bit_idx[1] into the vector at
 * cp->bit_idx[0], cp->number bits wide. A source address below 4 is
 * never advanced, so that single bit is reused for every position.
 */
static bool of_OR_narrow(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

	// Whether the source walks is fixed by its starting address.
      const unsigned src_step = (src >= 4) ? 1 : 0;

      for (unsigned left = cp->number ; left > 0 ; left -= 1) {
	    const vvp_bit4_t lb = thr_get_bit(thr, dst);
	    const vvp_bit4_t rb = thr_get_bit(thr, src);
	    thr_put_bit(thr, dst, lb|rb);

	    dst += 1;
	    src += src_step;
      }

      return true;
}

/*
 * Entry point for OR. Picks the narrow (bit loop) variant for widths
 * up to 4 and the wide (vector) variant otherwise, stores the choice
 * in cp->opcode, and then runs it.
 */
bool of_OR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      cp->opcode = (cp->number <= 4) ? &of_OR_narrow : &of_OR_wide;

      return cp->opcode(thr, cp);
}

/*
 * Vector-at-a-time NOR: OR the cp->number bit vectors at
 * cp->bit_idx[0] and cp->bit_idx[1], invert, and store the result at
 * cp->bit_idx[0].
 */
static bool of_NOR_wide(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->number;

      vvp_vector4_t acc = vthread_bits_to_vector(thr, dst, wid);
      acc |= vthread_bits_to_vector(thr, cp->bit_idx[1], wid);

      thr->bits4.set_vec(dst, ~acc);
      return true;
}

/*
 * Bit-at-a-time NOR of the vector at cp->bit_idx[1] into the vector
 * at cp->bit_idx[0], cp->number bits wide. A source address below 4
 * is never advanced, so that single bit is reused for every position.
 */
static bool of_NOR_narrow(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

	// Whether the source walks is fixed by its starting address.
      const unsigned src_step = (src >= 4) ? 1 : 0;

      for (unsigned left = cp->number ; left > 0 ; left -= 1) {
	    const vvp_bit4_t lb = thr_get_bit(thr, dst);
	    const vvp_bit4_t rb = thr_get_bit(thr, src);
	    thr_put_bit(thr, dst, ~(lb|rb));

	    dst += 1;
	    src += src_step;
      }

      return true;
}

/*
 * Entry point for NOR. Picks the narrow (bit loop) variant for widths
 * up to 4 and the wide (vector) variant otherwise, stores the choice
 * in cp->opcode, and then runs it.
 */
bool of_NOR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      cp->opcode = (cp->number <= 4) ? &of_NOR_narrow : &of_NOR_wide;

      return cp->opcode(thr, cp);
}

/*
 * Raise the thread vector at cp->bit_idx[0] to the power given by the
 * thread vector at cp->bit_idx[1] (both cp->number bits wide) using
 * vvp_vector2_t arithmetic, and write the low cp->number bits of the
 * result back to cp->bit_idx[0]. If either operand contains X or Z
 * bits the destination is filled with BIT4_X.
 */
bool of_POW(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      unsigned idx = cp->bit_idx[0];
      unsigned idy = cp->bit_idx[1];
      unsigned wid = cp->number;
      vvp_vector2_t xv2 = vvp_vector2_t(vthread_bits_to_vector(thr, idx, wid));
      vvp_vector2_t yv2 = vvp_vector2_t(vthread_bits_to_vector(thr, idy, wid));

	/* If we have an X or Z in the arguments return X. */
      if (xv2.is_NaN() || yv2.is_NaN()) {
	    for (unsigned jdx = 0 ;  jdx < wid ;  jdx += 1)
		  thr_put_bit(thr, idx+jdx, BIT4_X);
	    return true;
      }

	/* To make the result more manageable trim off the extra bits. */
      xv2.trim();
      yv2.trim();

      vvp_vector2_t result = pow(xv2, yv2);

	/* Only the part of the result that fits in the destination
	   is copied. */
      unsigned copy_wid = wid;
      if (result.size() < wid)
	    copy_wid = result.size();

      for (unsigned jdx = 0 ;  jdx < copy_wid ;  jdx += 1)
	    thr_put_bit(thr, idx+jdx,
			result.value(jdx) ? BIT4_1 : BIT4_0);

	/* If the result is narrower than the destination, zero pad the
	   remaining high bits. Counting upward keeps the unsigned
	   index from wrapping below zero when the result is empty. */
      for (unsigned jdx = copy_wid ;  jdx < wid ;  jdx += 1)
	    thr_put_bit(thr, idx+jdx, BIT4_0);

      return true;
}

/*
 * Signed power. The cp->number bit vectors at cp->bit_idx[0] and
 * cp->bit_idx[1] are converted to doubles (as signed values), the
 * double pow() function is applied, and the result is converted back
 * to a cp->number bit vector stored at cp->bit_idx[0]. If either
 * operand has X or Z bits the destination is filled with BIT4_X.
 */
bool of_POW_S(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->number;
      vvp_vector4_t base_vec = vthread_bits_to_vector(thr, dst, wid);
      vvp_vector4_t exp_vec = vthread_bits_to_vector(thr, cp->bit_idx[1], wid);

      if (base_vec.has_xz() || exp_vec.has_xz()) {
	      // Undefined input gives an undefined result.
	    unsigned pos = 0;
	    while (pos < wid) {
		  thr_put_bit(thr, dst+pos, BIT4_X);
		  pos += 1;
	    }
	    return true;
      }

      double base_val, exp_val;
      vector4_to_value(base_vec, base_val, true);
      vector4_to_value(exp_vec, exp_val, true);

      vvp_vector4_t res = vvp_vector4_t(wid, pow(base_val, exp_val));

      unsigned pos = 0;
      while (pos < wid) {
	    thr_put_bit(thr, dst+pos, res.value(pos));
	    pos += 1;
      }

      return true;
}

/*
 * Real power: the word selected by cp->bit_idx[0] is replaced by
 * itself raised to the word selected by cp->bit_idx[1].
 */
bool of_POW_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];

      const double base_val = thr->words[dst].w_real;
      const double exp_val = thr->words[src].w_real;
      thr->words[dst].w_real = pow(base_val, exp_val);

      return true;
}

/*
 * Common body of the %release/net and %release/reg instructions. The
 * part to release starts at cp->bit_idx[0] and is cp->bit_idx[1] bits
 * wide; it is clipped to the size reported by the filter of cp->net.
 * The net_flag is passed through to the filter: %release/net passes
 * true and %release/reg passes false. These are very similar to the
 * %deassign instruction.
 */
static bool do_release_vec(vthread_t thr, vvp_code_t cp, bool net_flag)
{
      vvp_net_t*net = cp->net;
      assert(net->fil);

      unsigned base  = cp->bit_idx[0];
      unsigned width = cp->bit_idx[1];

	// A part that starts beyond the signal has nothing to release.
      if (base >= net->fil->filter_size())
	    return true;

	// Clip a part that runs off the top of the signal.
      if (base+width > net->fil->filter_size())
	    width = net->fil->filter_size() - base;

      const bool partial = (base != 0) || (width != net->fil->filter_size());

	// XXXX Can't really do this if this is a partial release?
      net->fil->force_unlink();

      vvp_net_ptr_t ptr (net, 0);
      if (partial)
	    net->fil->release_pv(ptr, base, width, net_flag);
      else
	    net->fil->release(ptr, net_flag);

      return true;
}

/* Release with the net flag set. */
bool of_RELEASE_NET(vthread_t thr, vvp_code_t cp)
{
      const bool net_flag = true;
      return do_release_vec(thr, cp, net_flag);
}


/* Release with the net flag cleared. */
bool of_RELEASE_REG(vthread_t thr, vvp_code_t cp)
{
      const bool net_flag = false;
      return do_release_vec(thr, cp, net_flag);
}

/*
 * Release a forced value through the filter of cp->net. The type in
 * cp->bit_idx[0] is 1 for registers and 0 for everything else; the
 * filter is told "net" exactly when the type is 0.
 */
bool of_RELEASE_WR(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      const bool net_flag = (cp->bit_idx[0] == 0);

      assert(net->fil);
      net->fil->force_unlink();

	// Tell the signal to unforce itself.
      vvp_net_ptr_t ptr (net, 0);
      net->fil->release(ptr, net_flag);

      return true;
}

/*
 * %set/ar <label>, <index>, <bit>
 *
 * Write the real value in word register <bit> to the array <label>,
 * at the word addressed by the integer value in index register
 * <index>.
 */
bool of_SET_AR(vthread_t thr, vvp_code_t cp)
{
      const unsigned adr = thr->words[cp->bit_idx[0]].w_int;
      const double value = thr->words[cp->bit_idx[1]].w_real;

      array_set_word(cp->array, adr, value);

      return true;
}

/*
 * This implements the "%set/av <label>, <bit>, <wid>" instruction.
 * The <label> is an array label, and <bit>/<wid> describe the thread
 * vector holding the value to write. The array word address is taken
 * from word register 3 and the offset passed to array_set_word() is
 * taken from word register 1.
 */
bool of_SET_AV(vthread_t thr, vvp_code_t cp)
{
      const unsigned off = thr->words[1].w_int;
      const unsigned adr = thr->words[3].w_int;

	/* Collect the thread bits into a vector of the desired width. */
      vvp_vector4_t value = vthread_bits_to_vector(thr, cp->bit_idx[0],
						   cp->bit_idx[1]);

      array_set_word(cp->array, adr, off, value);
      return true;
}


/*
 * This implements the "%set/v <label>, <bit>, <wid>" instruction.
 *
 * The <label> is a reference to a vvp_net_t object, found in cp->net.
 *
 * The <bit> is the thread bit address, found in cp->bit_idx[0].
 *
 * The <wid> is the width of the vector to make, found in
 * cp->bit_idx[1]. It must not be zero.
 */
bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
{
      const unsigned wid = cp->bit_idx[1];
      assert(wid > 0);

      const unsigned bit = cp->bit_idx[0];

	/* The value goes to port 0 of the destination net. */
      vvp_net_ptr_t dest (cp->net, 0);

      vvp_send_vec4(dest, vthread_bits_to_vector(thr, bit, wid),
		    thr->wt_context);

      return true;
}

/*
 * Send the real value in the word register selected by
 * cp->bit_idx[0] to port 0 of the net in cp->net.
 */
bool of_SET_WORDR(vthread_t thr, vvp_code_t cp)
{
      const double value = thr->words[cp->bit_idx[0]].w_real;

      vvp_net_ptr_t dest (cp->net, 0);
      vvp_send_real(dest, value, thr->wt_context);

      return true;
}

/*
 * Implement the %set/x instruction:
 *
 *      %set/x <functor>, <bit>, <wid>
 *
 * The bits of a thread vector go into a part of the addressed
 * functor. The base of that part within the target comes implicitly
 * from word register 0 and may be negative. Bits that fall outside
 * the signal range are not transferred. The target dimensions come
 * from the vvp_signal_value reached through the filter of the
 * vvp_net pointer.
 */
bool of_SET_X0(vthread_t thr, vvp_code_t cp)
{
      vvp_net_t*net = cp->net;
      unsigned src = cp->bit_idx[0];
      unsigned wid = cp->bit_idx[1];
      long index = thr->words[0].w_int;

      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (net->fil);
      assert(sig);

	// Nothing to do if the whole part lies below bit 0 ...
      if (index < 0 && (wid <= (unsigned)-index))
	    return true;

	// ... or if it lies entirely above the top of the vector.
      if (index >= (long)sig->value_size())
	    return true;

	// A part that starts below bit 0 loses its first few bits:
	// skip them in the source (unless the source is one of the
	// non-advancing addresses below 4) and start at bit 0.
      if (index < 0) {
	    const unsigned skip = (unsigned) -index;
	    if (src >= 4)
		  src += skip;
	    wid -= skip;
	    index = 0;
      }

	// Clip the top of the part to the vector.
      if (index+wid > sig->value_size())
	    wid = sig->value_size() - index;

	// Collect the surviving bits. A source address below 4 stays
	// put, so the same bit is replicated.
      const unsigned src_step = (src >= 4) ? 1 : 0;
      vvp_vector4_t part (wid);
      for (unsigned pos = 0 ;  pos < wid ;  pos += 1) {
	    part.set_bit(pos, thr_get_bit(thr, src));
	    src += src_step;
      }

      vvp_net_ptr_t dest (net, 0);
      vvp_send_vec4_pv(dest, part, index, wid, sig->value_size(), thr->wt_context);

      return true;
}

/*
 * Left shift. The vector at cp->bit_idx[0] with width cp->number is
 * shifted left in place by the amount in index/word register 0.
 * Thread bit 4 set to 1 marks the shift amount as undefined, and the
 * result is then all X. A negative amount moves the value toward bit
 * 0 and pads the top with X. A shift of zero leaves the vector alone.
 */
bool of_SHIFTL_I0(vthread_t thr, vvp_code_t cp)
{
      const unsigned base = cp->bit_idx[0];
      const unsigned wid = cp->number;
      const long shift = thr->words[0].w_int;

      assert(base >= 4);
      thr_check_addr(thr, base+wid-1);

	// Undefined shift amount: the result is 'bx.
      if (thr_get_bit(thr, 4) == BIT4_1) {
	    vvp_vector4_t all_x (wid, BIT4_X);
	    thr->bits4.set_vec(base, all_x);
	    return true;
      }

	// Everything is shifted out: the result is constant 0.
      if (shift >= (long)wid) {
	    vvp_vector4_t all_0 (wid, BIT4_0);
	    thr->bits4.set_vec(base, all_0);
	    return true;
      }

      if (shift > 0) {
	      // Move the surviving low bits up ...
	    vvp_vector4_t keep (thr->bits4, base, wid-shift);
	    thr->bits4.set_vec(base+shift, keep);

	      // ... and fill zeros on the bottom.
	    vvp_vector4_t zeros (shift, BIT4_0);
	    thr->bits4.set_vec(base, zeros);
	    return true;
      }

      if (shift < 0) {
	      // Negative shift: move bits down, then pad with 'bx.
	    unsigned pos = 0;
	    while ((pos-shift) < wid) {
		  unsigned from = base + pos - shift;
		  unsigned to = base + pos;
		  thr_put_bit(thr, to, thr_get_bit(thr, from));
		  pos += 1;
	    }
	    while (pos < wid) {
		  thr_put_bit(thr, base+pos, BIT4_X);
		  pos += 1;
	    }
      }

      return true;
}

/*
 * This is an unsigned right shift:
 *
 *    %shiftr/i0 <bit>, <wid>
 *
 * The vector at address <bit> with width <wid> is shifted right a
 * number of bits stored in index/word register 0. Thread bit 4 set to
 * 1 marks the shift amount as undefined, and the result is then all
 * X. A positive shift fills the vacated high bits with 0. A negative
 * shift moves the value toward the high bits and pads the low bits
 * with X. A shift of zero leaves the vector alone.
 */
bool of_SHIFTR_I0(vthread_t thr, vvp_code_t cp)
{
      unsigned base = cp->bit_idx[0];
      unsigned wid = cp->number;
      long shift = thr->words[0].w_int;

      assert(base >= 4);
      thr_check_addr(thr, base+wid-1);

      if (thr_get_bit(thr, 4) == BIT4_1) {
	      // The result is 'bx if the shift amount is undefined.
	    vvp_vector4_t tmp (wid, BIT4_X);
	    thr->bits4.set_vec(base, tmp);

      } else if (shift > 0) {
	      // Move the surviving high bits down, then zero fill.
	    unsigned idx;
	    for (idx = 0 ;  (idx+shift) < wid ;  idx += 1) {
		  unsigned src = base + idx + shift;
		  unsigned dst = base + idx;
		  thr_put_bit(thr, dst, thr_get_bit(thr, src));
	    }
	    for ( ;  idx < wid ;  idx += 1)
		  thr_put_bit(thr, base+idx, BIT4_0);

      } else if (shift <= -(long)wid) {
	      // Negative shift is so far that all the value is shifted out.
	      // Write in a constant 'bx result. A shift of exactly -wid
	      // belongs here too: otherwise the branch below would build
	      // a zero-width slice and store it at base+wid, one past the
	      // address validated above.
	    vvp_vector4_t tmp (wid, BIT4_X);
	    thr->bits4.set_vec(base, tmp);

      } else if (shift < 0) {

	      // For a negative shift we pad with 'bx. Here
	      // 0 < -shift < wid, so both pieces are non-empty.
	    vvp_vector4_t tmp (thr->bits4, base, wid+shift);
	    thr->bits4.set_vec(base-shift, tmp);

	    vvp_vector4_t fil (-shift, BIT4_X);
	    thr->bits4.set_vec(base, fil);
      }
      return true;
}

/*
 * Sign-extending right shift. The vector at cp->bit_idx[0] with width
 * cp->number is shifted right in place by the amount in index/word
 * register 0, read here as an unsigned long. Vacated high bits take
 * the value the top bit had before the shift. Thread bit 4 set to 1
 * marks the shift amount as undefined, and the result is then all X.
 */
bool of_SHIFTR_S_I0(vthread_t thr, vvp_code_t cp)
{
      const unsigned base = cp->bit_idx[0];
      const unsigned wid = cp->number;
      const unsigned long shift = thr->words[0].w_int;

	// Capture the sign before any bit is overwritten.
      const vvp_bit4_t sign = thr_get_bit(thr, base+wid-1);

      if (thr_get_bit(thr, 4) == BIT4_1) {
	      // Undefined shift amount: the result is 'bx.
	    vvp_vector4_t all_x (wid, BIT4_X);
	    thr->bits4.set_vec(base, all_x);
	    return true;
      }

      if (shift >= wid) {
	      // Nothing survives; every bit becomes the sign.
	    for (unsigned pos = 0 ;  pos < wid ;  pos += 1)
		  thr_put_bit(thr, base+pos, sign);
	    return true;
      }

      if (shift > 0) {
	      // The low "keep" bits come from higher up; the rest are
	      // copies of the sign.
	    const unsigned long keep = wid - shift;
	    for (unsigned pos = 0 ;  pos < wid ;  pos += 1) {
		  if (pos < keep) {
			unsigned from = base + pos + shift;
			thr_put_bit(thr, base+pos, thr_get_bit(thr, from));
		  } else {
			thr_put_bit(thr, base+pos, sign);
		  }
	    }
      }

      return true;
}

/*
 * Subtract the thread vector at cp->bit_idx[1] from the vector at
 * cp->bit_idx[0], both cp->number bits wide, leaving the difference
 * at cp->bit_idx[0]. If vector_to_array() returns a null pointer for
 * either operand the destination is filled with BIT4_X.
 */
bool of_SUB(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->number;

      unsigned long*lhs = vector_to_array(thr, dst, wid);
      unsigned long*rhs = vector_to_array(thr, cp->bit_idx[1], wid);

      if (lhs == 0 || rhs == 0) {
	      // One of these may still be a live array.
	    delete[]lhs;
	    delete[]rhs;

	    vvp_vector4_t all_x(wid, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return true;
      }

	// a - b is computed as a + ~b + 1; the +1 is the initial carry.
      const unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
      unsigned long carry = 1;
      for (unsigned widx = 0 ;  widx < words ;  widx += 1)
	    lhs[widx] = add_with_carry(lhs[widx], ~rhs[widx], carry);

	/* We know from the vector_to_array that the address is valid
	   in the thr->bits4 vector, so just do the set bit. */
      thr->bits4.setarray(dst, wid, lhs);

      delete[]lhs;
      delete[]rhs;

      return true;
}

/*
 * Subtract two real word registers. The word selected by
 * cp->bit_idx[1] is subtracted from the word selected by
 * cp->bit_idx[0], and the difference replaces the latter.
 */
bool of_SUB_WR(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];

      const double diff = thr->words[dst].w_real - thr->words[src].w_real;
      thr->words[dst].w_real = diff;

      return true;
}

/*
 * Subtract the immediate value in cp->bit_idx[1] from the thread
 * vector at cp->bit_idx[0] (cp->number bits wide), in place. If
 * vector_to_array() returns a null pointer the destination is filled
 * with BIT4_X.
 */
bool of_SUBI(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned dst = cp->bit_idx[0];
      const unsigned wid = cp->number;
      const unsigned words = (wid+CPU_WORD_BITS-1)/CPU_WORD_BITS;

      unsigned long*lhs = vector_to_array(thr, dst, wid);
      if (lhs == 0) {
	    vvp_vector4_t all_x(wid, BIT4_X);
	    thr->bits4.set_vec(dst, all_x);
	    return true;
      }

	// a - imm is computed as a + ~imm + 1; the +1 is the initial
	// carry. The immediate only occupies the lowest word, so all
	// higher words of it are zero.
      unsigned long sub_word = cp->bit_idx[1];
      unsigned long carry = 1;
      for (unsigned widx = 0 ;  widx < words ;  widx += 1) {
	    lhs[widx] = add_with_carry(lhs[widx], ~sub_word, carry);
	    sub_word = 0UL;
      }

	/* We know from the vector_to_array that the address is valid
	   in the thr->bits4 vector, so just do the set bit. */
      thr->bits4.setarray(dst, wid, lhs);

      delete[]lhs;

      return true;
}

/*
 * Execute the VPI call attached to this instruction. Afterwards the
 * thread keeps running only if the scheduler is neither stopped nor
 * finished. When it is stopped but not finished, the thread is handed
 * back to the scheduler before this thread yields.
 */
bool of_VPI_CALL(vthread_t thr, vvp_code_t cp)
{
      vpip_execute_vpi_call(thr, cp->handle);

      if (! schedule_stopped())
	    return ! schedule_finished();

	// Stopped: requeue ourselves unless the simulation is over.
      if (! schedule_finished())
	    schedule_vthread(thr, 0, false);

      return false;
}

/* %wait <label>;
 * Implement the wait by locating the vvp_net_t for the event, and
 * adding this thread to the list of threads waiting on that event.
 * The argument is the reference to the functor to wait for. Its
 * function object must be a waitable_hooks_s of some sort.
 */
bool of_WAIT(vthread_t thr, vvp_code_t cp)
{
	/* A thread may only wait on one event at a time. */
      assert(! thr->waiting_for_event);
      thr->waiting_for_event = 1;

	/* Link this thread into the event's list of waiters. */
      waitable_hooks_s*hooks = dynamic_cast<waitable_hooks_s*> (cp->net->fun);
      assert(hooks);
      thr->wait_next = hooks->add_waiting_thread(thr);

	/* Return false to suspend this thread. */
      return false;
}


/*
 * Bitwise XNOR of the vector at cp->bit_idx[1] into the vector at
 * cp->bit_idx[0], cp->number bits wide. A source address below 4 is
 * never advanced, so that single bit is reused for every position.
 */
bool of_XNOR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

	// Whether the source walks is fixed by its starting address.
      const unsigned src_step = (src >= 4) ? 1 : 0;

      for (unsigned left = cp->number ;  left > 0 ;  left -= 1) {
	    const vvp_bit4_t lb = thr_get_bit(thr, dst);
	    const vvp_bit4_t rb = thr_get_bit(thr, src);
	    thr_put_bit(thr, dst, ~(lb ^ rb));

	    dst += 1;
	    src += src_step;
      }

      return true;
}


/*
 * Bitwise XOR of the vector at cp->bit_idx[1] into the vector at
 * cp->bit_idx[0], cp->number bits wide. When both input bits are 0
 * or 1 the output is 0 if they match and 1 if they differ; any other
 * input bit gives X. A source address below 4 is never advanced, so
 * that single bit is reused for every position.
 */
bool of_XOR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

	// Whether the source walks is fixed by its starting address.
      const unsigned src_step = (src >= 4) ? 1 : 0;

      for (unsigned left = cp->number ;  left > 0 ;  left -= 1) {
	    const vvp_bit4_t lb = thr_get_bit(thr, dst);
	    const vvp_bit4_t rb = thr_get_bit(thr, src);

	    const bool l_known = (lb == BIT4_0) || (lb == BIT4_1);
	    const bool r_known = (rb == BIT4_0) || (rb == BIT4_1);

	    vvp_bit4_t out = BIT4_X;
	    if (l_known && r_known)
		  out = (lb == rb) ? BIT4_0 : BIT4_1;

	    thr_put_bit(thr, dst, out);

	    dst += 1;
	    src += src_step;
      }

      return true;
}


/*
 * Park the thread on the null code address and, if it has neither a
 * parent nor a child, schedule it for deletion. Always returns false,
 * so the thread does not continue.
 */
bool of_ZOMBIE(vthread_t thr, vvp_code_t)
{
      thr->pc = codespace_null();

      const bool unattached = (thr->parent == 0) && (thr->child == 0);
      if (unattached)
	    schedule_del_thr(thr);

      return false;
}

/*
 * This is a phantom opcode used to call user defined functions. It
 * appears in code generated by the .ufunc statement. The instruction
 * carries a pointer to the executable code of the function (cp->cptr)
 * and a pointer to a ufunc_core object (cp->ufunc_core_ptr) that has
 * all the port information about the function.
 */
bool of_EXEC_UFUNC(vthread_t thr, vvp_code_t cp)
{
      struct __vpiScope*func_scope = cp->ufunc_core_ptr->func_scope();
      assert(func_scope);

	/* The calling thread must not have children in flight. */
      assert(thr->child == 0);
      assert(thr->fork_count == 0);

	/* A continuous assignment can only occur in a static scope,
	   so the caller cannot have a context of its own. That lets
	   us take a number of shortcuts below. */
      assert(thr->wt_context == 0);
      assert(thr->rd_context == 0);

	/* An automatic function gets a fresh context for this call,
	   which the caller also adopts for the duration. */
      vvp_context_t call_context = 0;
      if (func_scope->is_automatic) {
	    call_context = vthread_alloc_context(func_scope);
	    thr->wt_context = call_context;
	    thr->rd_context = call_context;
      }

	/* Copy all the inputs of the ufunc object to the port
	   variables of the function. This copies all the values
	   atomically. */
      cp->ufunc_core_ptr->assign_bits_to_ports(call_context);

	/* Run the function body in a temporary thread, right now. A
	   function may not contain any blocking statements, so
	   vthread_run() can only return when the %end opcode is
	   reached. */
      vthread_t worker = vthread_new(cp->cptr, func_scope);
      worker->wt_context = call_context;
      worker->rd_context = call_context;
      worker->is_scheduled = 1;
      vthread_run(worker);
      running_thread = thr;

	/* Copy the output from the result variable to the output
	   ports of the .ufunc device. */
      cp->ufunc_core_ptr->finish_thread(thr);

	/* An automatic function gives its context back, and the
	   caller returns to having none. */
      if (func_scope->is_automatic) {
	    vthread_free_context(call_context, func_scope);
	    thr->wt_context = 0;
	    thr->rd_context = 0;
      }

      return true;
}