New vCPU instructions 2.0

Using, learning, programming and modding the Gigatron and anything related.
Forum rules
Be nice. No drama.
lb3361
Posts: 77
Joined: 17 Feb 2021, 23:07

Re: New vCPU instructions 2.0

Post by lb3361 »

I have about 10% reduction in code size, but I am only using a subset of the new instructions as I do not know their opcodes and I cannot test.

I also read you have a number of interesting sys calls. I am hoping to use as many of them as possible, then maybe propose new ones. For instance, memcpy() https://github.com/lb3361/gigatron-lcc/ ... c/memcpy.s is ready for using a SYS_CopyMemory that resembles the SYS_SetMemory that memset() already uses. See also _memscan.s in the same subdir. I am also planning a SYS_CopyMemoryExt that writes into another memory bank, something that is very hard to do with the vCPU alone.

Note that I distinguish libc from the runtime support routines that implement mul, div, shifts, long int support, float support. Those are in https://github.com/lb3361/gigatron-lcc/ ... on/runtime. They rely on the traditional stack with push and pop. They use 0x81-0x88 for the accumulator LAC or FAC. They can temporarily use 0x88-0x8F and sysFn/sysArgs, but do not rely on them being preserved between calls. I am doing it this way to increase the chances of sharing code for all this. This is almost complete and passes a test suite. I just need one or two nights of insomnia to finish the fp support.
at67
Posts: 383
Joined: 14 May 2018, 08:29

Re: New vCPU instructions 2.0

Post by at67 »

My bad, I completely forgot about the opcodes, here's the current list:

Code: Select all

// Mnemonic -> encoding table for the single-opcode vCPU instructions.
// Each initializer lists, in order: the opcode byte, a second byte (0x00 for
// every entry in this group), the encoded instruction length, and the vCpu tag.
// NOTE(review): field meanings are inferred from the values shown here; confirm
// against the declaration of the _asmOpcodes value type.
_asmOpcodes["LDWI"  ] = {0x11, 0x00, ThreeBytes, vCpu};
_asmOpcodes["DEC"   ] = {0x14, 0x00, TwoBytes,   vCpu};
_asmOpcodes["MOVQ"  ] = {0x16, 0x00, ThreeBytes, vCpu};
_asmOpcodes["LSRB"  ] = {0x18, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LD"    ] = {0x1A, 0x00, TwoBytes,   vCpu};
_asmOpcodes["SEXT"  ] = {0x1C, 0x00, TwoBytes,   vCpu};
_asmOpcodes["CMPHS" ] = {0x1F, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LDW"   ] = {0x21, 0x00, TwoBytes,   vCpu};
_asmOpcodes["PEEK+" ] = {0x23, 0x00, TwoBytes,   vCpu};
_asmOpcodes["POKEI" ] = {0x25, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LSLV"  ] = {0x27, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ADDBA" ] = {0x29, 0x00, TwoBytes,   vCpu};
_asmOpcodes["STW"   ] = {0x2B, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ADDBI" ] = {0x2D, 0x00, ThreeBytes, vCpu};
_asmOpcodes["DBNE"  ] = {0x32, 0x00, ThreeBytes, vCpu};
_asmOpcodes["DOKEI" ] = {0x37, 0x00, ThreeBytes, vCpu};
_asmOpcodes["PEEKV" ] = {0x39, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DEEKV" ] = {0x3B, 0x00, TwoBytes,   vCpu};
_asmOpcodes["XORBI" ] = {0x3D, 0x00, ThreeBytes, vCpu};
_asmOpcodes["ANDBA" ] = {0x42, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ORBA"  ] = {0x44, 0x00, TwoBytes,   vCpu};
_asmOpcodes["XORBA" ] = {0x46, 0x00, TwoBytes,   vCpu};
_asmOpcodes["NOTB"  ] = {0x48, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DOKE+" ] = {0x4A, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LDI"   ] = {0x59, 0x00, TwoBytes,   vCpu};
_asmOpcodes["MOVQW" ] = {0x5B, 0x00, ThreeBytes, vCpu};
_asmOpcodes["ST"    ] = {0x5E, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DEEK+" ] = {0x60, 0x00, TwoBytes,   vCpu};
_asmOpcodes["POP"   ] = {0x63, 0x00, OneByte,    vCpu};
_asmOpcodes["MOV"   ] = {0x65, 0x00, ThreeBytes, vCpu};
_asmOpcodes["PEEKA" ] = {0x67, 0x00, TwoBytes,   vCpu};
_asmOpcodes["POKEA" ] = {0x69, 0x00, TwoBytes,   vCpu};
_asmOpcodes["TEQ"   ] = {0x6B, 0x00, TwoBytes,   vCpu};
_asmOpcodes["TNE"   ] = {0x6D, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DEEKA" ] = {0x6F, 0x00, TwoBytes,   vCpu};
_asmOpcodes["PUSH"  ] = {0x75, 0x00, OneByte,    vCpu};
_asmOpcodes["SUBBA" ] = {0x77, 0x00, TwoBytes,   vCpu};
_asmOpcodes["INCW"  ] = {0x79, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DECW"  ] = {0x7B, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DOKEA" ] = {0x7D, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LUP"   ] = {0x7F, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ANDI"  ] = {0x82, 0x00, TwoBytes,   vCpu};
_asmOpcodes["CALLI" ] = {0x85, 0x00, ThreeBytes, vCpu};
_asmOpcodes["ORI"   ] = {0x88, 0x00, TwoBytes,   vCpu};
_asmOpcodes["NOTW"  ] = {0x8A, 0x00, TwoBytes,   vCpu};
_asmOpcodes["XORI"  ] = {0x8C, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DBGE"  ] = {0x8E, 0x00, ThreeBytes, vCpu};
_asmOpcodes["BRA"   ] = {0x90, 0x00, TwoBytes,   vCpu};
_asmOpcodes["INC"   ] = {0x93, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ORBI"  ] = {0x95, 0x00, ThreeBytes, vCpu};
_asmOpcodes["CMPHU" ] = {0x97, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ADDW"  ] = {0x99, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LDNI"  ] = {0x9C, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ANDBK" ] = {0x9E, 0x00, ThreeBytes, vCpu};
_asmOpcodes["ORBK"  ] = {0xA0, 0x00, ThreeBytes, vCpu};
_asmOpcodes["XORBK" ] = {0xA2, 0x00, ThreeBytes, vCpu};
_asmOpcodes["PEEKA+"] = {0xA4, 0x00, TwoBytes,   vCpu};
_asmOpcodes["CMPI"  ] = {0xA7, 0x00, ThreeBytes, vCpu};
_asmOpcodes["PEEK"  ] = {0xAD, 0x00, OneByte,    vCpu};
_asmOpcodes["SYS"   ] = {0xB4, 0x00, TwoBytes,   vCpu};
_asmOpcodes["SUBW"  ] = {0xB8, 0x00, TwoBytes,   vCpu};
_asmOpcodes["JEQ"   ] = {0xBB, 0x00, ThreeBytes, vCpu};
_asmOpcodes["JNE"   ] = {0xBD, 0x00, ThreeBytes, vCpu};
_asmOpcodes["JLT"   ] = {0xBF, 0x00, ThreeBytes, vCpu};
_asmOpcodes["JGT"   ] = {0xC1, 0x00, ThreeBytes, vCpu};
_asmOpcodes["JLE"   ] = {0xC3, 0x00, ThreeBytes, vCpu};
_asmOpcodes["JGE"   ] = {0xC5, 0x00, ThreeBytes, vCpu};
_asmOpcodes["DEF"   ] = {0xCD, 0x00, TwoBytes,   vCpu};
_asmOpcodes["CALL"  ] = {0xCF, 0x00, TwoBytes,   vCpu};
_asmOpcodes["POKE+" ] = {0xD1, 0x00, TwoBytes,   vCpu};
_asmOpcodes["NEGW"  ] = {0xD3, 0x00, TwoBytes,   vCpu};
_asmOpcodes["TGE"   ] = {0xD5, 0x00, TwoBytes,   vCpu};
_asmOpcodes["TLT"   ] = {0xD7, 0x00, TwoBytes,   vCpu};
_asmOpcodes["TGT"   ] = {0xD9, 0x00, TwoBytes,   vCpu};
_asmOpcodes["TLE"   ] = {0xDB, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ANDBI" ] = {0xDD, 0x00, ThreeBytes, vCpu};
_asmOpcodes["ALLOC" ] = {0xDF, 0x00, TwoBytes,   vCpu};
_asmOpcodes["SUBBI" ] = {0xE1, 0x00, ThreeBytes, vCpu};
_asmOpcodes["ADDI"  ] = {0xE3, 0x00, TwoBytes,   vCpu};
_asmOpcodes["SUBI"  ] = {0xE6, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LSLW"  ] = {0xE9, 0x00, OneByte,    vCpu};
_asmOpcodes["STLW"  ] = {0xEC, 0x00, TwoBytes,   vCpu};
_asmOpcodes["LDLW"  ] = {0xEE, 0x00, TwoBytes,   vCpu};
_asmOpcodes["POKE"  ] = {0xF0, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DOKE"  ] = {0xF3, 0x00, TwoBytes,   vCpu};
_asmOpcodes["DEEK"  ] = {0xF6, 0x00, OneByte,    vCpu};
_asmOpcodes["ANDW"  ] = {0xF8, 0x00, TwoBytes,   vCpu};
_asmOpcodes["ORW"   ] = {0xFA, 0x00, TwoBytes,   vCpu};
_asmOpcodes["XORW"  ] = {0xFC, 0x00, TwoBytes,   vCpu};
_asmOpcodes["RET"   ] = {0xFF, 0x00, OneByte,    vCpu};

// Pseudo vCPU instructions
// HALT uses the same first byte as SYS (0xB4) with 0x80 as its second byte.
_asmOpcodes["HALT"  ] = {0xB4, 0x80, TwoBytes,   vCpu};

// PREFX3 vCPU instructions
// Every entry shares the first byte 0xC7 and is four bytes long; the second
// byte differs per mnemonic.
// NOTE(review): presumably 0xC7 is the PREFX3 prefix opcode and the second
// byte selects the instruction within the prefix page -- confirm.
_asmOpcodes["ST2"   ] = {0xC7, 0x11, FourBytes, vCpu};
_asmOpcodes["STW2"  ] = {0xC7, 0x14, FourBytes, vCpu};
_asmOpcodes["XCHG"  ] = {0xC7, 0x17, FourBytes, vCpu};
_asmOpcodes["MOVW"  ] = {0xC7, 0x19, FourBytes, vCpu};
_asmOpcodes["ADDWI" ] = {0xC7, 0x1B, FourBytes, vCpu};
_asmOpcodes["SUBWI" ] = {0xC7, 0x1D, FourBytes, vCpu};
_asmOpcodes["ANDWI" ] = {0xC7, 0x1F, FourBytes, vCpu};
_asmOpcodes["ORWI"  ] = {0xC7, 0x21, FourBytes, vCpu};
_asmOpcodes["XORWI" ] = {0xC7, 0x23, FourBytes, vCpu};
_asmOpcodes["LDPX"  ] = {0xC7, 0x25, FourBytes, vCpu};
_asmOpcodes["STPX"  ] = {0xC7, 0x28, FourBytes, vCpu};
_asmOpcodes["CONDI" ] = {0xC7, 0x2B, FourBytes, vCpu};
_asmOpcodes["CONDB" ] = {0xC7, 0x2D, FourBytes, vCpu};
_asmOpcodes["CONDIB"] = {0xC7, 0x30, FourBytes, vCpu};
_asmOpcodes["CONDBI"] = {0xC7, 0x33, FourBytes, vCpu};

// Gigatron vCPU branch instructions
// All six conditional branches share the first byte 0x35 and are three bytes
// long; the second byte differs per condition.
_asmOpcodes["BEQ"] = {0x35, 0x3F, ThreeBytes, vCpu};
_asmOpcodes["BGT"] = {0x35, 0x4D, ThreeBytes, vCpu};
_asmOpcodes["BLT"] = {0x35, 0x50, ThreeBytes, vCpu};
_asmOpcodes["BGE"] = {0x35, 0x53, ThreeBytes, vCpu};
_asmOpcodes["BLE"] = {0x35, 0x56, ThreeBytes, vCpu};
_asmOpcodes["BNE"] = {0x35, 0x72, ThreeBytes, vCpu};
I'll post more information on the Sys calls tonight.
lb3361
Posts: 77
Joined: 17 Feb 2021, 23:07

Re: New vCPU instructions 2.0

Post by lb3361 »

I also need to know the order of the operands. For instance, in ORBI(x,y), is x the address aa and y the immediate ii? Which one comes first in the encoding: 95 aa ii or 95 ii aa? Plus I need to test. This is why I prefer to wait for the rom source.
lb3361
Posts: 77
Joined: 17 Feb 2021, 23:07

Re: New vCPU instructions 2.0

Post by lb3361 »

Ideas for prefix2.

I now have long and fp support in the C compiler. One of the issues is that code becomes very bulky because moving data around requires a lot of instructions. For instance moving a long between registers requires 2 ldw and 2 stw (8 bytes) and moving a float requires another 4 bytes for the exponent. Replacing these instructions by a little routine is not much of a saving because preparing the arguments (src and dst address) still makes for bulky call sequences. This bulky code means that routines do not easily fit in the 96 bytes that follow a scanline, which causes another layer of slowdowns.

So, before even thinking of supporting long and float arithmetic with SYS calls, it would be desirable to have MOVL v1,v2 (moving 4 bytes) and MOVF v1,v2 (moving 5 bytes), and maybe even LOKEA/LEEKA and fp equivalents (FOKEA might not be a good name.) Doing this might involve writing instructions that self restart like SYS. This might seem slow, but the code size benefits are substantial enough.

I am therefore eager to see your rom to propose such prefix2 instructions.


- L.
at67
Posts: 383
Joined: 14 May 2018, 08:29

Re: New vCPU instructions 2.0

Post by at67 »

lb3361 wrote: 13 Jun 2021, 00:48 Ideas for prefix2.

So, before even thinking of supporting long and float arithmetic with SYS calls, it would be desirable to have MOVL v1,v2 (moving 4 bytes) and MOVF v1,v2 (moving 5 bytes), and maybe even LOKEA/LEEKA and fp equivalents (FOKEA might not be a good name.) Doing this might involve writing instructions that self restart like SYS. This might seem slow, but the code size benefits are substantial enough.
Wouldn't MOVL v1, v2 and MOVF v1, v2 require PREFX3?

I've added the following PREFX2 instruction, (which can potentially save a lot of code space if left shifting by N multiple times).

This instruction can perform 'n' left shifts from 0 to 255 times, (0 and >15 don't have much use), all with 3 bytes of code space. It is very slightly slower than a corresponding sequence of LSLW instructions when 'n' >= 4, but potentially uses much less code space.

The cool thing about this instruction is that it self restarts the PREFX2 payload, so after the PREFX2 header is executed, the PREFX2 payload, (left shift instruction), is executed 'n' times.
  • LSLN <imm>, vAC <<= 8bit imm, 26 + 30*n + 20 cycles.

Code: Select all

# pc = 0x032f, Opcode = 0x2f
# Instruction PREFX2
# Entry stub: jumps to 'prefx2#13' with AC = 0x22, which that routine stores
# into vCpuSelect.
label('PREFX2')
ld(hi('prefx2#13'),Y)           #10 #12
jmp(Y,'prefx2#13')              #11
ld(0x22)                        #12 ENTER is at $(n-1)ff, where n = instruction page

# PREFX2 implementation
# Stores the incoming AC in vCpuSelect, advances the low byte of vPC by one,
# copies the byte at [vPC+1 (high) : vPC (low) + 1] into sysArgs+7 as the
# operand, then jumps to 'NEXTY' in page vCpuSelect+1.
# The final ld(-26/2) matches the "26" term of the cycle count quoted for LSLN.
label('prefx2#13')
st([vCpuSelect])                #13 select the PREFX2 page (AC set by the entry stub)
ld([vPC])                       #14
adda(1)                         #15
st([vPC])                       #16 Advance vPC
adda(1,X)                       #17 X = low byte of the operand address
ld([vPC+1],Y)                   #18 Y = high byte of vPC
ld([Y,X])                       #19 fetch operand byte
st([sysArgs+7])                 #20 Operand
ld([vCpuSelect])                #21
adda(1,Y)                       #22 Y = vCpuSelect + 1
jmp(Y,'NEXTY')                  #23
ld(-26/2)                       #24


#-----------------------------------------------------------------------
#       PREFX2 instruction page, (0x2300)
#-----------------------------------------------------------------------
#
# Fetch/dispatch loop for the PREFX2 page. 'prefx2#13' and '.lsln#16' reach
# it by jumping to 'NEXTY' with Y = vCpuSelect + 1.
# NOTE(review): the labels '.next2', 'NEXTY' and 'EXIT' used by this code are
# not defined in this excerpt -- confirm their placement in the full ROM source.
bra('.next2')                   #0 Enter at '.next2' (so no startup overhead)
# --- Page boundary ---
align(0x100,size=0x100)
ld([vPC+1],Y)                   #1

# Fetch next instruction and execute it, but only if there are sufficient
# ticks left for the slowest instruction.
adda([vTicks])                  #0 Track elapsed ticks (actually counting down: AC<0)
blt('EXIT')                     #1 Escape near time out
st([vTicks])                    #2
ld([vPC],X)                     #3 PREFIX is 1 byte, vPC has been incremented by 1
nop()                           #4
st(vCpuSelect,[vCpuSelect])     #5 Reset to default vCPU page
ld([Y,X])                       #6 Fetch opcode (actually a branch target)
st([Y,Xpp])                     #7 Just X++
bra(AC)                         #8 Dispatch
ld([Y,X])                       #9 Prefetch operand

# Resync with video driver and transfer control
adda(maxTicks)                  #3
bgt(pc()&255)                   #4 Resync
suba(1)                         #5
ld(hi('vBlankStart'),Y)         #6
jmp(Y,[vReturn])                #7 To video driver
ld(0)                           #8 AC should be 0 already. Still..
assert vCPU_overhead ==          9

# pc = 0x2311, Opcode = 0x11
# Instruction LSLN
# Entry stub: jumps to 'lsln#13' with AC loaded from sysArgs+7, where
# 'prefx2#13' stored the operand and 'lsln#13' keeps the remaining count.
label('LSLN')
ld(hi('lsln#13'),Y)             #10
jmp(Y,'lsln#13')                #11
ld([sysArgs+7])                 #12 number of bits to shift


#-----------------------------------------------------------------------
#       PREFX2 implementation page, (0x2600)
#-----------------------------------------------------------------------
#
# LSLN implementation
# Entered from 'LSLN' with AC = [sysArgs+7], the remaining shift count.
# Each pass decrements the count and writes it back. While the decremented
# count is >= 0, vAC is shifted left by one bit and the PREFX2 payload is
# restarted through 'NEXTY' with vCpuSelect set to 0x22 again; otherwise the
# routine exits through 'NEXTY' in its own page.
# NOTE(review): bge presumably tests AC as a signed byte, so counts above 128
# would exit without shifting -- confirm.
label('lsln#13')
suba(1)                         #13 decrement remaining count
bge('.lsln#16')                 #14 shifts remaining: do one more
st([sysArgs+7])                 #15 write back the decremented count
ld(hi('NEXTY'),Y)               #16 exit PREFX2 instruction page
jmp(Y,'NEXTY')                  #17
ld(-20/2)                       #18
label('.lsln#16')
ld([vAC])                       #16
anda(128,X)                     #17 X = bit 7 of the low byte (0 or 128)
adda([vAC])                     #18 AC = low byte * 2
st([vAC])                       #19
ld([X])                         #20 reads zero-page address 0 or 128; presumably these hold 0 and 1 (the carry) -- confirm
adda([vAC+1])                   #21
adda([vAC+1])                   #22 AC = carry + high byte * 2
st([vAC+1])                     #23
ld(0x22)                        #24 ENTER is at $(n-1)ff, where n = instruction page
st([vCpuSelect])                #25 restore PREFX2 instruction page
adda(1,Y)                       #26 restart instruction
jmp(Y,'NEXTY')                  #27
ld(-30/2)                       #28
lb3361
Posts: 77
Joined: 17 Feb 2021, 23:07

Re: New vCPU instructions 2.0

Post by lb3361 »

You're keeping the state in sysArgs+7! Nifty.
qwertyface
Posts: 31
Joined: 16 Jul 2019, 09:19

Re: New vCPU instructions 2.0

Post by qwertyface »

Nice. Looping works out an awful lot smaller than trying to do the whole thing in one go, and will win in cost in a lot of cases. Are you planning to do the same thing for logical and arithmetic right shift? The same broad approach would work, but the cost of repeatedly entering the right-shift table is higher (and the page after the right-shift table is getting crowded).
at67
Posts: 383
Joined: 14 May 2018, 08:29

Re: New vCPU instructions 2.0

Post by at67 »

I've implemented a LSRB, (non PREFX instruction), in ROMvX0, so applying this technique, (as a PREFX restart-able instruction), to allow right byte shifts from 1 to 7 should be doable. But I was never able to get a LSRW implemented in under 30 cycles. In fact I wasn't even close, I think my best was around 44-46 cycles.

It may be possible to do a LSRW using PREFX, PREFX payload restart, (to process the low and high bytes on separate phases), and processing low byte MSB state in the sysArgs registers though.
qwertyface
Posts: 31
Joined: 16 Jul 2019, 09:19

Re: New vCPU instructions 2.0

Post by qwertyface »

Yes, I think to do both bytes and the transfer in 30 is not possible. If you could do it in two-phases it should work (but how would you dispatch to different code on the second and subsequent calls?). My implementation of Arithmetic shift-right is 45 excluding the cost of dispatch and I thought that was fairly reasonable. You can save some cycles because you are working with fixed zero-page addresses and don't need to keep loading Y and X, and can probably put more code in the page after the shift-table, whereas I tried to keep it minimal.
at67
Posts: 383
Joined: 14 May 2018, 08:29

Re: New vCPU instructions 2.0

Post by at67 »

qwertyface wrote: 22 Jun 2021, 08:12 Yes, I think to do both bytes and the transfer in 30 is not possible. If you could do it in two-phases it should work (but how would you dispatch to different code on the second and subsequent calls?). My implementation of Arithmetic shift-right is 45 excluding the cost of dispatch and I thought that was fairly reasonable. You can save some cycles because you are working with fixed zero-page addresses and don't need to keep loading Y and X, and can probably put more code in the page after the shift-table, whereas I tried to keep it minimal.
I implemented a 3 phase version of LSRW that cost just over 100 cycles, (pretty expensive), but only 2 bytes, (using PREFX1)...state was kept in sysArgs6 and sysArgs7. In the end I abandoned that approach and created a LSRV instruction instead in the 0x300 page...it's 52 cycles and 2 bytes but allows right shifting of any word variable in zero page space.

To make room for LSRV I removed SYS_Read3_40 as it's not needed if the Pictures application is MIA and I moved SYS_Unpack_56 out of the 0x0600 page into one of the other SYS pages. Obviously moving SYS_Unpack_56 for ROMvX0 means that any applications that use it at the old address will not run on ROMvX0 without a re-compile/re-assemble; I haven't found any applications that use it yet, (my search was not exhaustive though).

The novelty of this instruction is that it uses Marcel's SYS mechanism for retrying the instruction if there are not enough ticks left in the current scanline. This approach can be used to extend any instruction in ROMvX0 beyond the maxTicks=30 limit, (which should be extremely useful for large non restarting PREFX instructions).

Code: Select all

# pc = 0x031c, Opcode = 0x1c
# Instruction LSRV: Logical shift right word var, 52 cycles
# Entry stub: saves AC (the variable's address) in sysArgs+7 and in X, then
# continues at 'lsrv#13'.
label('LSRV')
ld(hi('lsrv#13'),Y)             #10 #12
jmp(Y,'lsrv#13')                #11
st([sysArgs+7],X)               #12 var


# LSRV implementation
# First checks whether enough ticks remain for the 52-cycle instruction.
# If not, vPC is moved back by 2 so the instruction is fetched again later.
# Otherwise the low byte of the variable (address in X) is masked with 0xfe
# and sent through 'shiftTable', with '.lsrv#26' stored in vTmp as the
# continuation address.
label('lsrv#13')
ld((272 - 52//2) & 255)         #13 convert LSRV 52 cycles to vTicks delta, (272 for ROMvX0, 270 for ROMv1 to ROMv5a/DEVROM)
adda([vTicks])                  #14
blt('.lsrv#17')                 #15 not enough time left, so retry
ld(hi('shiftTable'),Y)          #16 logical shift right 1 bit (X >> 1)
ld('.lsrv#26')                  #17 continuation address
st([vTmp])                      #18
ld([X])                         #19 load byte from var
anda(0xfe)                      #20 clear bit 0 before indexing the table
jmp(Y,AC)                       #21
bra(255)                        #22 bra shiftTable+255
# continues in page 0x0600 at label('.lsrv#26') after fetching shifted byte from 0x0500
label('.lsrv#17')
ld([vPC])                       #17 retry instruction
suba(2)                         #18 step vPC back by 2
st([vPC])                       #19
ld(hi('NEXTY'),Y)               #20
jmp(Y,'NEXTY')                  #21
ld(-24/2)                       #22


#-----------------------------------------------------------------------
#       vCPU LSRV
#-----------------------------------------------------------------------

# Second half of LSRV, reached as the continuation stored in vTmp by 'lsrv#13'.
# Stores the shifted low byte, copies bit 0 of the high byte into bit 7 of the
# low byte, then sends the high byte through the shift table and stores the
# result at '.lsrv47'.
# NOTE(review): the second jmp(Y,AC) relies on Y still holding
# hi('shiftTable') from 'lsrv#13' -- confirm the table code preserves Y.
label('.lsrv#26')
st([X])                         #26 shifted low byte
ld([sysArgs+7])                 #27 low byte address
adda(1)                         #28
st([sysArgs+6],X)               #29 high byte address
ld([X])                         #30 bit 0 of high byte
anda(1)                         #31
adda(127)                       #32 1 -> 128, 0 -> 127
anda(128)                       #33 keep only bit 7
ld([sysArgs+7],X)               #34
ora([X])                        #35
st([X])                         #36 transfer to bit 7 of low byte
ld('.lsrv47')                   #37 continuation address
st([vTmp])                      #38
ld([sysArgs+6],X)               #39 X = high byte address
ld([X])                         #40
anda(0b11111110)                #41 clear bit 0 before indexing the table
jmp(Y,AC)                       #42
bra(255)                        #43 bra shiftTable+255
# continues in page 0x0600 at label('.lsrv47') after fetching shifted byte from 0x0500
label('.lsrv47')
st([X])                         #47 shifted high byte
ld(hi('NEXTY'),Y)               #48
jmp(Y,'NEXTY')                  #49
ld(-52/2)                       #50
Post Reply