suggestion on optimization

Page 3/5
1 | 2 | | 4 | 5

By bore

Expert (115)

bore's picture

05-04-2019, 01:41

Oh, another thing:

	ld	de,128+16
	add	hl,de		; hl = enemy.y - ymap + 16
	jr	nc,.next	; no carry: !(-16 <= enemy.y - ymap < 128)

This makes sure that, when the jump is not taken, hl is in the range 0 to 128+16-1 (0..143).

This means that

	ld	a,l
	add	a,64-16		; a = enemy.y - ymap + 64

always clears the carry.

So the "and a" a bit further down isn't strictly necessary.

By ARTRAG

Enlighted (6236)

ARTRAG's picture

05-04-2019, 09:52

I've taken 2 out of 3 of your suggestions
Thanks


	; One 4-byte sprite attribute entry, as written into the RAM copy of the SAT.
	struct sat
y		db	0	; vertical position (the plot routine stores enemy.y - ymap + 64)
x		db	0	; horizontal position
f		db	0	; shape (frame number)
c		db	0	; colour; bit 7 is the EC flag
	ends


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;	plot enemies and bullets if visible in the current SAT in ram
;
;	depends on xmap,ymap

_plot_enemy:
; Walks every slot of the `enemies` array and, for each object whose status
; bit 0 is set and which lies inside the visible window, writes one or two
; entries into the RAM SAT pointed to by (alt_ram_sat).
;
; In:   (xmap),(ymap)        scroll position of the map window
; Out:  (alt_visible_sprts)  number of SAT planes written
;       enemy_data.status    bit 7 set if plotted, reset otherwise
;       enemy_data.plane     first SAT plane used by a plotted object
; Regs: ix = current object, iy = current SAT entry,
;       b  = objects left,   c  = next free SAT plane
; Clobbers: af, bc, de, hl, ix, iy, (tempx), (tempy)

	ld	iy,(alt_ram_sat)
	ld	ix,enemies
	ld	bc,(max_enem + max_plyr_bullets + max_enem_bullets)*256+0

	ld	hl,-128
	ld	de,(ymap)
	and a
	sbc	hl,de
	ld	(tempy),hl		; tempy = -(ymap + 128)

	ld	hl,(xmap)
	ld	de,-32
	add	hl,de
	ld	(tempx),hl		; tempx = xmap - 32 (wraps to 0xFFxx when xmap < 32)

.npc_loop1:
	bit 0,(ix+enemy_data.status)
	jp	z,.invisible

	ld	l,(ix+enemy_data.y+0)
	ld	h,(ix+enemy_data.y+1)
	ld	de,(tempy)

	add	hl,de			; hl = enemy.y - (ymap + 128)
	ld	de,128+16
	add	hl,de			; hl = enemy.y - ymap + 16, CF set <==> result in 0..143
	jr	nc,.invisible	; !(-16 <= enemy.y - ymap < 128)

	ld	a,l
	add	a,64-16			; a = enemy.y - ymap + 64 (<= 191, so CF is reset)
	ld	(iy+sat.y+0),a
	ld	(iy+sat.y+4),a	; not needed if single layer but in this way it is overall faster

	ld	l,(ix+enemy_data.x+0)
	ld	h,(ix+enemy_data.x+1)
	ld	de,(tempx)
						; CF is reset by previous add
	sbc hl,de			; hl = enemy.x - xmap + 32
	jp	m,.invisible	; hl < 0  <==> dx = enemy.x - xmap < -32

	ld	de,32
	and a				; clear CF: the sbc above borrows whenever tempx has
						; wrapped below zero (xmap < 32), which would make the
						; next sbc subtract 33 and shift X by one pixel
	sbc hl,de			; hl = dx, CF set <==> dx < 0

	ld	a,(ix+enemy_data.color)
	jp nc,.noec			; dx >= 0: plain X, no EC
	or	128				; set EC
	add	hl,de			; -32 <= dx < 0: store dx + 32
.noec:
	ld	e,a				; e = colour (with EC if needed); d stays 0 from ld de,32
	ld	a,h
	and a
	jp	nz,.invisible	; dx > 255

	ld	a,(ix+enemy_data.frame)
	ld	(iy+sat.x),l				; write X
	ld	(iy+sat.f),a				; write shape
	ld	(iy+sat.c),e				; write colour
	ld	(ix+enemy_data.plane),c		; save SAT plane
	inc c
	set 7,(ix+enemy_data.status)	; set it as visible
	cp	16*4						; hard coded in the SPT: frames >= 16*4 use two layers
	jp	nc,.two_layers

.one_layer:

	ld	e,sat			; d is still 0 here, so de = size of one SAT entry
	add iy,de
	jp 	.next

.invisible:
	res 7,(ix+enemy_data.status)	; set it as invisible

.next:
	ld	de,enemy_data
	add ix,de
	djnz	.npc_loop1

	ld	a,c
	ld	(alt_visible_sprts),a
	ret

.two_layers:

	ld	(iy+sat.x+4),l				; second layer X
	add	a,4
	ld	(iy+sat.f+4),a				; second layer shape = frame + 4
	ld	a,e
	and 0xF0						; keep the upper nibble, EC included
	inc	a							; second layer is always black
	ld	(iy+sat.c+4),a
	inc c
	ld	e,2*sat			; d is still 0 here, so de = size of two SAT entries
	add iy,de
	jp 	.next

By bore

Expert (115)

bore's picture

05-04-2019, 14:30

OK.
I would probably have placed .invisible after the ret and made a jump back to .next.
It will cost a few cycles more in the case when many sprites are invisible, but in that case you save a lot of cycles by not writing it to the sat. (Same reasoning as why to use relative jumps to .invisible instead of absolute whenever possible.)

While it might be more costly when there are few sprites on the screen, the case where you want to save as many cycles as possible is when there are many active sprites.

By bore

Expert (115)

bore's picture

05-04-2019, 15:45

This may or may not work but could be worth trying.

	ld	hl,32
	ld	de,(xmap)
	and	a
	sbc	hl,de
	ld	(tempx),hl		; tempx = 32 - xmap
	ld	l,(ix+enemy_data.x+0)
	ld	h,(ix+enemy_data.x+1)
	ld	de,(tempx)

	add	hl,de      	; hl = enemy.x - xmap + 32
	ld	de,-256-32
	add	hl,de		; hl = enemy.x - xmap - 256
	jr	c,.invisible	; !(-32 <= enemy.x - xmap < 256)

	ld	a,(ix+enemy_data.color)
	inc	h		; z if 0 <= enemy.x - xmap < 256
	jp	z,.noec	; jump when dx >= 0; fall through when -32 <= dx < 0
	or	128		; set EC (also clears CF for the sbc below)
	sbc	hl,de		; hl - (-256-32): l = dx + 32
.noec
	ld	h,(ix+enemy_data.frame)
	ld	(iy+sat.x),l				; write X
	ld	(iy+sat.f),h				; write shape
	ld	(iy+sat.c),a				; write colour
	ld	(ix+enemy_data.plane),c		; save SAT plane

Range check is inverted compared to y and sets carry if out of range.

By ARTRAG

Enlighted (6236)

ARTRAG's picture

06-04-2019, 10:29

Hmm, I've tested your last modification and it does not work. I will try to investigate it.
About jumps I expect to have more objects off screen than on screen, so .invisible should be the preferred branch
Moreover I expect more two colors items on screen than single layer items (bullets)
Anyway I'm very happy about the current level of optimization, thanks a lot!

By bore

Expert (115)

bore's picture

06-04-2019, 11:00

I shuffled around the registers used for writing to sat at the end, so the .two_layers writes have to be updated too.
I also didn't see the "cp 16*4" so you might need to change the last part to get back to the old register usage

.noec
	ld	e,a
	ld	a,(ix+enemy_data.frame)
	ld	(iy+sat.x),l				; write X
	ld	(iy+sat.f),a				; write shape
	ld	(iy+sat.c),e				; write colour
	ld	(ix+enemy_data.plane),c		; save SAT plane  

If you expect to mostly have two layers you could switch place of .two_layers and .one_layer and use relative jump again since it will fall through faster.

By ARTRAG

Enlighted (6236)

ARTRAG's picture

06-04-2019, 23:03

OK, it works, thanks! I had to change `ld e,sat` back to `ld de,sat`, but overall it is much faster now.
This is the last version


	; One 4-byte sprite attribute entry, as written into the RAM copy of the SAT.
	struct sat
y		db	0	; vertical position (the plot routine stores enemy.y - ymap + 64)
x		db	0	; horizontal position
f		db	0	; shape (frame number)
c		db	0	; colour; bit 7 is the EC flag
	ends


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;	plot enemies and bullets if visible in the current SAT in ram
;
;	depends on xmap,ymap

_plot_enemy:
; Walks every slot of the `enemies` array and, for each object whose status
; bit 0 is set and which lies inside the visible window, writes one or two
; entries into the RAM SAT pointed to by (alt_ram_sat).
;
; In:   (xmap),(ymap)        scroll position of the map window
; Out:  (alt_visible_sprts)  number of SAT planes written
;       enemy_data.status    bit 7 set if plotted, reset otherwise
;       enemy_data.plane     first SAT plane used by a plotted object
; Regs: ix = current object, iy = current SAT entry,
;       b  = objects left,   c  = next free SAT plane
; Clobbers: af, bc, de, hl, ix, iy, (tempx), (tempy)

	ld	iy,(alt_ram_sat)
	ld	ix,enemies 
	ld	bc,(max_enem + max_plyr_bullets + max_enem_bullets)*256+0
	
	ld	hl,-128
	ld	de,(ymap)
	and a
	sbc	hl,de
	ld	(tempy),hl		; tempy = -(ymap + 128)

	ld	hl,32
	ld	de,(xmap)
	sbc	hl,de			; CF is clear after the previous sbc as long as ymap <= 0xFF80
	ld	(tempx),hl		; tempx = 32 - xmap

.npc_loop1:
	bit 0,(ix+enemy_data.status)
	jp	z,.invisible

	ld	l,(ix+enemy_data.y+0)
	ld	h,(ix+enemy_data.y+1)
	ld	de,(tempy)
	
	add	hl,de			; hl = enemy.y - (ymap + 128)
	ld	de,128+16
	add	hl,de			; hl = enemy.y - ymap + 16, CF set <==> result in 0..143
	jr	nc,.invisible	; !(-16 <= enemy.y - ymap < 128)

	ld	a,l
	add	a,64-16			; a = enemy.y - ymap + 64	
	ld	(iy+sat.y+0),a
	ld	(iy+sat.y+4),a	; not needed if single layer but in this way it is overall faster 
	
	ld	l,(ix+enemy_data.x+0)
	ld	h,(ix+enemy_data.x+1)
	ld	de,(tempx)

	add	hl,de      		; hl = enemy.x - xmap + 32
	ld	de,-256-32
	add	hl,de			; hl = enemy.x - xmap - 256
	jr	c,.invisible	; !(-32 <= enemy.x - xmap < 256)

	ld	a,(ix+enemy_data.color)
	inc	h				; z if 0 <= enemy.x - xmap < 256
	jp	z,.noec			; jump when dx >= 0; fall through when -32 <= dx < 0
	or	128				; set EC (also clears CF for the sbc below)
	sbc	hl,de			; hl - (-256-32): l = dx + 32
.noec:
	ld	e,a				; e = colour (with EC if needed); d keeps the high byte of -256-32
	ld	a,(ix+enemy_data.frame)
	ld	(iy+sat.x),l				; write X
	ld	(iy+sat.f),a				; write shape
	ld	(iy+sat.c),e				; write colour
	ld	(ix+enemy_data.plane),c		; save SAT plane    
	inc c
	set 7,(ix+enemy_data.status)	; set it as visible
	cp	16*4					; hard coded in the SPT: frames >= 16*4 use two layers
	jp	nc,.two_layers

.one_layer:

	ld	de,sat			; full 16-bit load: d is not 0 at this point
	add iy,de
	jp 	.next
	
.invisible
	res 7,(ix+enemy_data.status)	; set it as invisible
		
.next:
	ld	de,enemy_data
	add ix,de
	djnz	.npc_loop1

	ld	a,c
	ld	(alt_visible_sprts),a
	ret
	
.two_layers:
	ld	(iy+sat.x+4),l				; second layer X
	add	a,4
	ld	(iy+sat.f+4),a				; second layer shape = frame + 4
	ld	a,e
	and 0xF0			; keep the upper nibble, EC included
	inc	a				; second layer is always black
	ld	(iy+sat.c+4),a	
	inc c
	ld	de,2*sat		; full 16-bit load: d is not 0 at this point
	add iy,de
	jp 	.next

By ricbit

Champion (437)

ricbit's picture

07-04-2019, 23:33

I was able to optimize it a bit.

Here's the original version:

; x - xmap < -32      ; 28 clocks
; -32 <= x - xmap < 0 ; 134 clocks
; 0 <= x - xmap < 256 ; 114 clocks
; x - xmap >= 256     ; 114 clocks 

original:
    ; On entry: hl = enemy.x, de = tempx, CF = 0 (per the routine this was taken from).
    ; NOTE(review): the second sbc inherits CF from the first one. If de has
    ; wrapped below zero (tempx = xmap - 32 with xmap < 32) the first sbc always
    ; borrows, jp m is not taken, and hl ends up one too small - confirm that
    ; xmap >= 32 always holds, or clear CF before the second sbc.
    sbc  hl, de                    ; 17  
    jp   m, invisible              ; 11

    ld   de, 32                    ; 11
    sbc  hl, de                    ; 17

    ld   a, (ix+enemy_data_color)  ; 21
    jp   nc, noec                  ; 11
    or   128                       ; 8
    add  hl, de                    ; 12
noec:
    ld   e, a                      ; 5
    ld   a, h                      ; 5
    and  a                         ; 5
    jp   nz, invisible             ; 11

Here's my proposal:

; x - xmap < -32      ; 28 clocks
; -32 <= x - xmap < 0 ; 106 clocks
; 0 <= x - xmap < 256 ; 114 clocks
; x - xmap >= 256     ; 80 clocks 

proposal:
    ; On entry: hl = enemy.x, de = xmap - 32, CF = 0 (as in the code it replaces).
    ; On exit from has_ec: e = colour with EC set when dx < 0,
    ;                      l = X byte for the SAT (dx, or dx + 32 with EC).
    sbc  hl, de                    ; 17   hl = dx + 32
    jp   m, invisible              ; 11   sign test, like the original: a carry
                                   ;      test would cull everything once
                                   ;      xmap - 32 wraps below zero (xmap < 32)

    ld   a, l                      ; 5
    sub  32                        ; 8    low byte of dx, borrow if l < 32
    ld   e, a                      ; 5 
    ld   a, h                      ; 5 
    sbc  a, 0                      ; 8    high byte of dx
    jr   c, has_ec                 ; 13/8 dx < 0: a = 0xFF, l still holds dx + 32
    jr   nz, invisible             ; 13/8 dx >= 256

    ld   l, e                      ; 5    0 <= dx < 256: l = dx, a = 0

has_ec:
    and  128                       ; 8    a = 128 (EC) when dx < 0, else 0
    or   (ix+enemy_data_color)     ; 21
    ld   e, a                      ; 5

By ARTRAG

Enlighted (6236)

ARTRAG's picture

08-04-2019, 18:28

Hi Ricbit, thanks for the suggestions
I will try to include them tonight!

By ricbit

Champion (437)

ricbit's picture

08-04-2019, 18:54

This was a mix of superoptimization and coding by hand (superopt currently can't deal with the zero flag).

Page 3/5
1 | 2 | | 4 | 5