Gigatron Hackers • New vCPU instructions 2.0 - Page 2

Page 2 of 10

Re: New vCPU instructions 2.0

Posted: 26 Feb 2021, 23:42

by at67

A few simple graphics benchmarks tested across all current ROM versions in Mode 2.

Code: Select all

1) Lines: draws 6 full screen lines 100 times.
2) Rects: draws 1 full screen filled rectangle 100 times.
3) Circles: draws 1 full screen filled circle 100 times.

: GraphicsPrims_bench.JPG (78.39 KiB) Viewed 3183 times

Re: New vCPU instructions 2.0

Posted: 10 Mar 2021, 01:42

by lb3361

I am so excited by your vSPH changed that I couldn't wait. I looked carefully at the dev.rom and, to my surprise, I was able to implement the change in the same footprint, just by squeezing instructions in the implementation of PUSH, POP, LDLW, and STLW. PUSH and POP are just as fast as they were. LDLW and STLW are one tick faster! Just in case it helps!

Changes here https://github.com/lb3361/gigatron-rom/ ... c1fff65dfc, summarized below

SYS_Reset:

Code: Select all

              0060 0000  ld   $00         652   ld(0)                           #17
              0061 c21c  st   [$1c]       653   st([vSP])                       #18 vSP
              0062 c204  st   [$04]       654   st([vSPH])                      #19 vSPH [new instruction]

POP instruction:

Code: Select all

                                          1630  label('POP')
POP:          0363 1504  ld   [$04],y     1631  ld([vSPH],Y)                    #10,15
              0364 111c  ld   [$1c],x     1632  ld([vSP],X)                     #11
              0365 0d00  ld   [y,x]       1633  ld([Y,X])                       #12
              0366 c21a  st   [$1a]       1634  st([vLR])                       #13
              0367 de00  st   [y,x++]     1635  st([Y,Xpp])                     #14  Just x++
              0368 0d00  ld   [y,x]       1636  ld([Y,X])                       #15
              0369 c21b  st   [$1b]       1637  st([vLR+1])                     #16
              036a 011c  ld   [$1c]       1638  ld([vSP])                       #17
              036b 8002  adda $02         1639  adda(2)                         #18
              036c c21c  st   [$1c]       1640  st([vSP])                       #19
                                          1641  label('.pop#20')
.pop#20:      036d 0116  ld   [$16]       1642  ld([vPC])                       #20
              036e a001  suba $01         1643  suba(1)                         #21
              036f c216  st   [$16]       1644  st([vPC])                       #22
              0370 fc00  bra  NEXTY       1645  bra('NEXTY')                    #23
              0371 00f3  ld   $f3         1646  ld(-26/2)                       #24

PUSH instruction:

Code: Select all

                                          1655  label('PUSH')
PUSH:         0375 1504  ld   [$04],y     1656  ld([vSPH],Y)                    #10
              0376 011c  ld   [$1c]       1657  ld([vSP])                       #11
              0377 a002  suba $02         1658  suba(2)                         #12
              0378 d21c  st   [$1c],x     1659  st([vSP],X)                     #13
              0379 011a  ld   [$1a]       1660  ld([vLR])                       #14
              037a de00  st   [y,x++]     1661  st([Y,Xpp])                     #15
              037b 011b  ld   [$1b]       1662  ld([vLR+1])                     #16
              037c ce00  st   [y,x]       1663  st([Y,X])                       #17
              037d fc6d  bra  .pop#20     1664  bra('.pop#20')                  #18
              037e 0200  nop              1665  nop()                           #19  !!!

STLW instruction

Code: Select all

                                          2020  label('stlw')
stlw:         0441 911c  adda [$1c],x     2021  adda([vSP],X)                   #13
              0442 1504  ld   [$04],y     2022  ld([vSPH],Y)                    #14
              0443 0118  ld   [$18]       2023  ld([vAC])                       #15
              0444 de00  st   [y,x++]     2024  st([Y,Xpp])                     #16
              0445 0119  ld   [$19]       2025  ld([vAC+1])                     #17
              0446 ce00  st   [y,x]       2026  st([Y,X])                       #18
              0447 1403  ld   $03,y       2027  ld(hi('REENTER'),Y)             #19
              0448 e0cb  jmp  y,$cb       2028  jmp(Y,'REENTER')                #20
              0449 00f4  ld   $f4         2029  ld(-24/2)                       #21
              044a 0200  nop              2030  nop()  # saved
              044b 0200  nop              2031  nop()  # saved

LDLW instruction

Code: Select all

                                          2034  label('ldlw')
ldlw:         044c 911c  adda [$1c],x     2035  adda([vSP],X)                   #13
              044d 1504  ld   [$04],y     2036  ld([vSPH],Y)                    #14
              044e 0d00  ld   [y,x]       2037  ld([Y,X])                       #15
              044f c218  st   [$18]       2038  st([vAC])                       #16
              0450 de00  st   [y,x++]     2039  st([Y,Xpp])                     #17  Just x++
              0451 0d00  ld   [y,x]       2040  ld([Y,X])                       #18
              0452 c219  st   [$19]       2041  st([vAC+1])                     #19
              0453 1403  ld   $03,y       2042  ld(hi('NEXTY'),Y)               #20
              0454 e000  jmp  y,$00       2043  jmp(Y,'NEXTY')                  #21
              0455 00f4  ld   $f4         2044  ld(-24/2)                       #22
              0456 0200  nop              2045  nop()  # saved

Re: New vCPU instructions 2.0

Posted: 10 Mar 2021, 07:21

by at67

Yeah the implementation of a 16bit vSP was trivial to be honest, I had the exact same tick values as you when I first did the change as well; but I used an extra level of indirection, (sacrificing a few ticks), to squeeze in many more instructions...I have 28 new instructions now.

I'm still not 100% settled on all of them and their usefulness, but I have crafted around 35 instructions and rotated them in and out of the compiler and vCPU examples to see which had the most affect on byte size and tick count. Performing a statistical analysis of instruction frequency/size/speed across all current applications and samples actually only provided minimal data, it's when you start hand coding tight loops that are 1/3 of the size and 3-4 times faster that you realise the potential of carefully picking new instructions.

I'll release the new ROM and the source shortly.

Re: New vCPU instructions 2.0

Posted: 10 Mar 2021, 12:09

by lb3361

Nobody knows better which instructions would help the basic compiler.

I was mostly thinking about what's needed to resurrect the C compiler to be honest. The main issues for a C compiler are to make sure we can do move things around easily, and in particular easily enough to deal with real stack frames. Otherwise the needs are similar to those of Basic I believe. The following observations may help your decisions.

Summary

If you had to pick only one of my proposals in addition to what you already have planned, please do the MOVWA one (the word version of your MOVBA). Next is MOVW (the word version of your MOVB). I am sure you'll find them useful for Basic as well.

1 - Moving things around without damaging vAC.

Modern code generators need registers. The only way to do so in the gigatron is to reserve a part of the page zero and call them "registers". Then we need to move things around. I believe the operations we need most are the following. I use the prefix MOV for all instructions that move things from/into zero page variables, keeping LD and ST for the ones that affect vAC, but that's purely cosmetic.

Code: Select all

MOV r1,r2		Replaces LD r1; ST r2 		Your MOVB
MOVW r1,r2		Replaces LDW r1; STW r2		Less efficient with two MOVBs. ==> Please consider this one!
MOVQ b,r1		Replaces LDI b; ST r1		Your LDQ
MOVQW b, r1		Replaces LDI b; STW r1		Your LDQW

2- Instruction to implement indexed addressing.

Pretty much all modern CPUs rely on loads and stores with indexed addressing. For us this means true 16 bits address calculation. I believe it is okay to explicitly compute addresses in vAC. For the equivalent of a load from disp(r1) to r2, we can do "LDWI disp; ADDW r1; DEEK; ST r2'. But the equvalent of a store from r2 to disp(r1) is very inconvenient: "LDWI disp; ADDW r1; STW tmp; LDW r2; DOKE". Basically POKE and DOKE work backwards. Instead of instructions to store vAC at addresses found in page zero, one needs instructions to store something found in page zero at the address contained in vAC.

Code: Select all

MOVBA r1	 	Replaces STW tmp; LD r1; POKE tmp		Your MOVBA in fact
MOVWA r1		Replaces STW tmp; LDW r1; DOKE tmp		Painful to do with two MOVBAs. ==> Please consider this one!

3- True 16 bits stack

The vSPH change is very useful but not yet a 16 bits stack. It is more like a 8 bit stack that can live in the page specified by vSPH. Out of the five instructions that deal with the stack, only PUSH and POP are widely used. The stack frame instructions ALLOC, LDLW, STLW are seldom used and nearly useless for C because they work with a 8 bit stack (they manipulate vSP but not vSPH). Also we're missing the byte variants LDLB and STLB.

I can see two ways to go.

The first approach tries to use LDLW/STLW as much as possible. When allocating a stack frame, the function prologue must now detect that changing only vSP would wrap around. When this is the case, we need to decrease vSPH, and we need to waste some stack space to ensure that we can access the required frame size using only LDLW and STLW. Finally we need to save the old values of vSP/vSPH and vLR in the new stack frame so that we can restore them in the function epilogue. To make this practical we would need byte variants of LDLW and STLW, and we would probably need a new opcode ENTER to help setting up the stack frame with all this page crossing testing business. Note that we don't need PUSH and POP anymore.

The second approach is to totally ignore the VCPU stack and instead implement a new one. This is made easier by the instructions that help indexed access because they allow us to access the stack frame variables. And we can use the approach used in C compilers for RISC processors that consists of only changing the stack pointer in the prologues and epilogues (that means, preallocating space for the arguments of the most complex function call).

I believe that solution (1) would be a bit faster but much more complicated. If we completely ignore vSP/vSPH, solution (2) does not require any new opcodes. If we want to still use vSP/vSPH to maximize interoperability, we might consider an opcode that computes a local variable address like this:

Code: Select all

LDLA imm		Replaces MOVB vSP,vAC, MOVB vSPH,vAC+1, ADDI imm

then rely on PEEK/DEEK/MOVBA/MOVWA to access the variable themselves.

Re: New vCPU instructions 2.0

Posted: 10 Mar 2021, 20:27

by at67

lb3361 wrote: ↑10 Mar 2021, 12:09 I was mostly thinking about what's needed to resurrect the C compiler to be honest. The main issues for a C compiler are to make sure we can do move things around easily, and in particular easily enough to deal with real stack frames. Otherwise the needs are similar to those of Basic I believe.

Personally I have no technical interest in the C compiler, I think it's a very poor fit for the Gigatron's default memory map and limited resources, but that doesn't mean I don't want it to succeed. The more options we have for software development on the Gigatron, the better, so if someone is willing to take up the mantle and see it through to completion and I can make their life a little easier, then I am up for that.

lb3361 wrote: ↑10 Mar 2021, 12:09 Summary
If you had to pick only one of my proposals in addition to what you already have planned, please do the MOVWA one (the word version of your MOVBA). Next is MOVW (the word version of your MOVB). I am sure you'll find them useful for Basic as well.

MOVWA is possible in 30 ticks or less outside of page3, I can add that one, (but I think you're understanding of how it works is backwards, it replaces PEEK, not POKE).

MOVW is just not possible in any other page other than page3, (MOVB takes 28 cycles outside of page3), and due to the current vCPU opcode usage, (branch target of the vCPU interpreter), the only way to find 28 words of space for a page3, 28 cycle instruction, is to move one of the 28 cycle instructions like ADDW or SUBW out of page3, (without breaking vCPU opcode compatibility).

Initially I was transfixed on having an XCHG r0, r1 instruction, (so that I could replace 6 instructions with 1 when swapping byte variables and 6 with 2 when swapping word variables), to make this work I had to increase maxticks from 28 to 32, move ADDW from page3, (28 cycles), to external page, (32 cycles), so that I could just barely fit a 28 cycle XCHG into page3.

It wasn't worth it, all code ran around 15% slower because of the maxticks change because there are less vCPU slots available per scanline as a result of the firmware's simple, (but fast), vCPU slot allocation mechanism.

In the end I settled on maxticks=30 as a good compromise between speed and new instruction feature expandability, this allowed me to move almost all of the instructions out of page3 with either 2 or 3 instruction prologues and 3 instruction instead of 2 instruction epilogues, (the important instructions, ADDW/SUBW/BCC etc, can't be moved).

The prologues and epilogues add an extra burden of either 3 or 4 cycles to your cycle allocation and if you create an instruction with 2 operands, (like MOVB), you have to add extra cycles to parse the 2nd operand and then another 3 cycles to fix vPC, all of these extra cycle allocations before you've even begun to implement your instruction, (which must fit in a max of 30 cycles).

I actually spent the first week trying to re-write the entire vCPU interpreter so that all opcodes had an extra level of indirection, so that page 3 was purely used as a jump table and contained no actual instruction code, I also tried to remove the automatic +2 to vPC within the vCPU dispatch and have each instruction do the vCPU fixup itself, which would now be possible because page3 space limitations were no longer a factor, (given the above).

Both these ideas require a complete rethink of how the vCPU interpreter works and I am not sure they are even possible, (whilst retaining 100% software compatibility), i.e. apps, emulators, code all rely on the -2 vPC fixup as vPC is pre-incremented by 2 each vCPU dispatch.

lb3361 wrote: ↑10 Mar 2021, 12:09 1 - Moving things around without damaging vAC.

This was one of my main goals in creating new instructions.

lb3361 wrote: ↑10 Mar 2021, 12:09 Modern code generators need registers. The only way to do so in the gigatron is to reserve a part of the page zero and call them "registers". Then we need to move things around. I believe the operations we need most are the following. I use the prefix MOV for all instructions that move things from/into zero page variables, keeping LD and ST for the ones that affect vAC, but that's purely cosmetic.

I like this formality a lot, I will adopt it and rename the affected instructions appropriately.

lb3361 wrote: ↑10 Mar 2021, 12:09 2- Instruction to implement indexed addressing.

Here was another area I spent a considerable amount of time on, I tried everything I could think of to come up with some sort of useful indexing, (array instruction), none of my ideas were possible outside of page3 with maxticks=30 and given the cycle limitations noted above.

An optimised ADDW in page3 takes 28 cycles, you could get that down to 26 cycles in page 3, by having two separate code paths for the carry and borrow code, but once again there is no room if you wish to remain opcode compatible, (it's a shame that the hardware carry/borrow is thrown away and not made accessible to native code, having to calculate it in SW each time you need it, really stings).

So 16bit arithmetic outside of page 3 for indexed modes is not possible, indexed modes with 8 bit operands are possible and I initially implemented an instruction to do so and found it mostly useless in my experiments, I'm sure in a bespoke application it could redeem itself.

lb3361 wrote: ↑10 Mar 2021, 12:09 Pretty much all modern CPUs rely on loads and stores with indexed addressing. For us this means true 16 bits address calculation. I believe it is okay to explicitly compute addresses in vAC. For the equivalent of a load from disp(r1) to r2, we can do "LDWI disp; ADDW r1; DEEK; ST r2'. But the equvalent of a store from r2 to disp(r1) is very inconvenient: "LDWI disp; ADDW r1; STW tmp; LDW r2; DOKE". Basically POKE and DOKE work backwards. Instead of instructions to store vAC at addresses found in page zero, one needs instructions to store something found in page zero at the address contained in vAC.

Just a quick addition for anyone following along, I assume you meant:

Code: Select all

LDWI   disp
ADDW   r1
STW    tmp
LDW    r2
DOKE   tmp

lb3361 wrote: ↑10 Mar 2021, 12:09

Code: Select all

MOVBA r1	 	Replaces STW tmp; LD r1; POKE tmp		Your MOVBA in fact
MOVWA r1		Replaces STW tmp; LDW r1; DOKE tmp		Painful to do with two MOVBAs. ==> Please consider this one!

This is where I think the misunderstanding is, my 'MOVBA tmp' replaces 'PEEK, ST tmp', it doesn't replace the yukky POKE example you gave above, but we could add a POKEA.

This would change this instruction carnage:

Code: Select all

LDWI   disp
ADDW   r1
STW    tmp
LDW    r2
POKE   tmp

To the more managable:

Code: Select all

LDWI   disp
ADDW   r1
POKEA  r2

lb3361 wrote: ↑10 Mar 2021, 12:09 3- True 16 bits stack
I can see two ways to go.

The first approach tries to use LDLW/STLW as much as possible. When allocating a stack frame, the function prologue must now detect that

The second approach is to totally ignore the VCPU stack and instead implement a new one. This is made easier by the instructions that help

I believe that solution (1) would be a bit faster but much more complicated. If we completely ignore vSP/vSPH, solution (2) does not require any new opcodes. If we want to still use vSP/vSPH to maximize interoperability, we might consider an opcode that computes a local variable address like this:
Code: Select all
LDLA imm		Replaces MOVB vSP,vAC, MOVB vSPH,vAC+1, ADDI imm  		
then rely on PEEK/DEEK/MOVBA/MOVWA to access the variable themselves.

I like option 2, for the BASIC compiler a 256 byte stack page is more than enough, the only real stack requirement it has, is for CALL/CALLI return's and thus far I have been making do with 16 bytes for that, (and 8 bytes for parameters and local variables within procedures in a separate part of zero page), so 256 bytes of stack space is like flying space unicorns farting out gold coins into my lap.

You could also keep the vSP/vSPH/256 byte stack for hardware CALL's, (128 levels of nested functions/recursion, probably not be enough for diehard C coders), and implement the rest of the stack functionality as a SW stack as you suggested.

Re: New vCPU instructions 2.0

Posted: 10 Mar 2021, 22:05

by lb3361

You're right. I did not understand your MOVEBA. What I would like are instructions to avoid what you call the instruction "carnage" in both word and byte form. POKEA and DOKEA are good enough names.

In fact the carnage appears whenever you want to do write into a field of a structure located somewhere in memory. Suppose p is a pointer to struct { ... int x; ... }. Then to access field x, you need to put p in some memory location and do an indexed access with the offset of x.

Code: Select all

   p->x = value becomes   LDWI offset, ADDW p, DOKEA value

It also happens with an array, except that you need to do the address calculation

Code: Select all

  p->x[i] = value becomes  LD i, LSLW, ADDI offset, ADDW p, DOKEA value 
  and x[i] = value becomes  LD i, LSLW, ADDW x, DOKEA value

I'd be surprised if you didn't have these problems in Basic as well to some level.

As for the stack, I realize that there is no need to abide by the vSP stack to be interoperable. The CALL{I} and RET functions do not know anything about the stack. They just know about vLR. So as long as we eventually return to the address found in vLR, everything should work.

Thanks for the quick reply.

Re: New vCPU instructions 2.0

Posted: 11 Mar 2021, 02:00

by at67

lb3361 wrote: ↑10 Mar 2021, 22:05 I'd be surprised if you didn't have these problems in Basic as well to some level.

I do have these issues, I use helpers like these, (their function pointers live in zero page RAM rather than needing the LDWI/CALL thunk for ROM's < 5a):

Code: Select all

%SUB                convert8Arr2d
convert8Arr2d       ADDW    memIndex0
                    ADDW    memIndex0
                    DEEK
                    ADDW    memIndex1
                    STW     memAddr
                    RET
%ENDS

%SUB                convert8Arr3d
convert8Arr3d       ADDW    memIndex0
                    ADDW    memIndex0
                    DEEK
                    ADDW    memIndex1
                    ADDW    memIndex1
                    DEEK
                    ADDW    memIndex2
                    STW     memAddr
                    RET
%ENDS

%SUB                convert16Arr2d
convert16Arr2d      ADDW    memIndex0
                    ADDW    memIndex0
                    DEEK
                    ADDW    memIndex1
                    ADDW    memIndex1
                    STW     memAddr
                    RET
%ENDS

%SUB                convert16Arr3d
convert16Arr3d      ADDW    memIndex0
                    ADDW    memIndex0
                    DEEK
                    ADDW    memIndex1
                    ADDW    memIndex1
                    DEEK
                    ADDW    memIndex2
                    ADDW    memIndex2
                    STW     memAddr
                    RET
%ENDS

I am planning on replacing these with hand crafted sys calls, (which are coded the same way as vCPU instructions without the maxtick limitation), they will obviously be specific to the array topology of gtBASIC, but there is plenty of ROM space to create helpers/accessors for other languages as well, e.g. C.

These sys calls will use the predefined set of registers, (sys args), so you will still have to load the base and displacement, but there will be plenty of opportunities for peephole and local optimisations using these registers, (in my compiler anyway), that aren't there now.

lb3361 wrote: ↑10 Mar 2021, 22:05 As for the stack, I realize that there is no need to abide by the vSP stack to be interoperable. The CALL{I} and RET functions do not know anything about the stack. They just know about vLR. So as long as we eventually return to the address found in vLR, everything should work.

Very true, vLR allows you to implement any kind of stack system that you like.

lb3361 wrote: ↑10 Mar 2021, 22:05 Thanks for the quick reply.

No problem, it's nice to get some engagement on these new instructions.

Re: New vCPU instructions 2.0

Posted: 11 Mar 2021, 16:23

by lb3361

at67 wrote: ↑11 Mar 2021, 02:00 I am planning on replacing these with hand crafted sys calls, (which are coded the same way as vCPU instructions without the maxtick limitation), they will obviously be specific to the array topology of gtBASIC, but there is plenty of ROM space to create helpers/accessors for other languages as well, e.g. C.

The problem is that sys calls (especially the long ones) do not schedule as well as short instructions. The VCPU might stall until one reaches a blank videoline. So it might make sense to split the actual address calculation into well thought instructions. I would not be surprised if

Code: Select all

  ADDW r1
  ADDW r1
  DEEK or DOKEA    [this is where I don't get how you use the normal DOKE, btw]

turned out to run faster on average than a SYS call that does the same thing in one piece, even if we ignore the SYS call setup time. On a related topic, I just found out that the v6502 splits each 6502 opcode in two separately scheduled pieces: fetching arguments with the right addressing mode and executing the instruction.

Where we really need SYS calls are multibyte arithmetic operations and copy operations. Besides the 16 bits mulu,muls,divu,divs, we should think of the 32 bits add/sub/neg/mulu/muls/divu/divs, and maybe of floating point operations. For the copy operations, we could think of memset/memcpy/memcpytobank with maybe fast paths when the low byte of both source and destination are identical. The difficulty is of course that we cannot do these operations in one go and we have to split them into small pieces that are executed through repeated invocations of the SYS call. Doing this right takes a lot of time.

Re: New vCPU instructions 2.0

Posted: 11 Mar 2021, 21:21

by at67

lb3361 wrote: ↑11 Mar 2021, 16:23 The problem is that sys calls (especially the long ones) do not schedule as well as short instructions. The VCPU might stall until one reaches a blank

That's true to some extent, but the advantage of sys calls is that the more densely you compress the call with the equivalent functionality of separate vCPU instructions, the more efficient the sys call becomes. Each vCPU instruction has a minimum cost of 9 prologue ticks, (more like 12 when you factor in that 95% of instructions have been moved out of page3 using indirection, to make room for more instructions), and an epilogue cost of 5 or 6 ticks, so that's 18 ticks, (and it's 21 if you need to fix vPC), per instruction. A sys call has a cost of 17 + 6 worst case, so 23 per sys call, vs 90 for a block of 5 vCPU instructions, 23 vs 180 for 10 instructions, etc. Then the sys call has the advantage when it comes to self-restarting, the only way vCPU can match that is to unroll the vCPU loop and then you get to very quickly say goodbye to your precious RAM, so comparing a self restarting sys call to a block of vCPU code in a loop is even more in favour of the sys call.

Here are some examples:

ROMv5a Mandelbrot using fixed point and symmetry around X axis cheat.

: Mandel1.png (48.28 KiB) Viewed 3044 times

ROMvX0 Mandelbrot using fixed point and symmetry around X axis cheat as well as new instructions and int16 sys call for multiply.

: Mandel2.png (47.23 KiB) Viewed 3044 times

https://www.youtube.com/watch?v=o226dcTOQEY vs https://www.youtube.com/watch?v=KdskySMhgig

lb3361 wrote: ↑11 Mar 2021, 16:23 videoline. So it might make sense to split the actual address calculation into well thought instructions. I would not be surprised if
Code: Select all
  ADDW r1
  ADDW r1
  DEEK or DOKEA    [this is where I don't get how you use the normal DOKE, btw]
turned out to run faster on average than a SYS call that does the same thing in one piece, even if we ignore the SYS call setup time. On a related topic,

You're right about this short segment, replacing this with a sys call would be nonsense, not only would the sys call itself probably take just about as long as well as be harder to schedule, loading it's sysArg registers would take as long if not longer than the segment you are replacing in the first place, (even before you have called the sys call).

Using sys calls as array accessers/helpers is only for the more complex 2d and 3d arrays where there is multiple levels of indirection and at least 10 vCPU instructions performing the task in subroutines.

lb3361 wrote: ↑11 Mar 2021, 16:23 Where we really need SYS calls are multibyte arithmetic operations and copy operations. Besides the 16 bits mulu,muls,divu,divs, we should think of the 32 bits add/sub/neg/mulu/muls/divu/divs, and maybe of floating point operations. For the copy operations, we could think of memset/memcpy/memcpytobank with maybe fast paths when the low byte of both source and destination are identical. The difficulty is of course that we cannot do these operations in one go and we have to split them into small pieces that are executed through repeated invocations of the SYS call. Doing this right takes a lot of time.

I have already implemented the following functions as self-restarting sys calls and they work fantastically well, usually around 3-5 times faster than the equivalent ROMvX0 tight vCPU loop, (5-6+ times faster for earlier ROM's).

Code: Select all

- int16 multiply inner loop
- int16 divide inner loop
- line draw 2 pixel per iteration inner loop
- blit screen to memory
- page memcpy
- full memcpy
- scrollVTableY
- offsetVTableY
- *unfinished* vertical blank sprites that save/restore and have colourkey/transparency

Re: New vCPU instructions 2.0

Posted: 11 Mar 2021, 21:37

by at67

Update:

Thanks to lb3361 for the idea of a more formal and consistent naming scheme:

NOTB was removed
XORB was removed

XORI was moved from page3 to an external page, cycle time increased from 14 to 20

MOVBA was renamed to MOVA
MOVB was renamed to MOV
LDQW was renamed to MOVQW
LDQ was renamed to MOVQ

Added the MOVAW instruction, 30 cycles, move word from [vAC] to var
Added the MOVV instruction, 22 cycles, move a byte from var to [vAC]
Added the MOVVW instruction, 30 cycles, move a word from var to [vAC]

Code: Select all

# MOVAW implementation, move word from [vAC] to var
label('movaw#13')
ld([vAC+1],Y)                   #13
ld([vAC],X)                     #14
ld([Y,X])                       #15 peek [vAC]
st([Y,Xpp])                     #16
st([sysArgs+6])                 #17
ld([Y,X])                       #18 peek [vAC+1]
st([sysArgs+7])                 #19
ld(0,Y)                         #20
ld([vTmp],X)                    #21 dst zero page variable
ld([sysArgs+6])                 #22
st([Y,Xpp])                     #23 [var.lo] = peek [vAC]
ld([sysArgs+7])                 #24
st([Y,X])                       #25 [var.hi] = peek [vAC+1]
ld(hi('NEXTY'),Y)               #26
jmp(Y,'NEXTY')                  #27
ld(-30/2)                       #28

# pc = 0x035b, Opcode = 0x5b
# Instruction MOVV: Move a byte from var to [vAC], 22 cycles
label('MOVV') 
ld(hi('movv#13'),Y)             #10 #12
jmp(Y,'movv#13')                #11
ld(AC,X)                        #12
# MOVV implementation
label('movv#13')
ld([X])                         #13 [var.lo]
ld([vAC],X)                     #14
ld([vAC+1],Y)                   #15
st([Y,X])                       #16 poke [vAC], [var.lo]
ld(hi('REENTER'),Y)             #17
jmp(Y,'REENTER')                #18
ld(-22/2)                       #19

# pc = 0x0360, Opcode = 0x60
# Instruction MOVVW: Move a word from var to [vAC], 30 cycles
label('MOVVW') 
ld(hi('movvw#13'),Y)            #10 #12
jmp(Y,'movvw#13')               #11
st([vTmp],X)                    #12
# MOVVW implementation
label('movvw#13')
ld([X])                         #13 [var.lo]
ld([vAC],X)                     #14
ld([vAC+1],Y)                   #15
st([Y,X])                       #16 poke [vAC], [var.lo]
ld([vTmp])                      #17
adda(1,X)                       #18
ld([X])                         #19 [var.hi]
st([vTmp])                      #20
ld([vAC])                       #21
adda(1,X)                       #22
ld([vTmp])                      #23
st([Y,X])                       #24 poke [vAC+1], [var.hi]
ld(hi('REENTER'),Y)             #25
jmp(Y,'REENTER')                #26
ld(-30/2)                       #27