Ver código fonte

Optimise decode/read path operations in a C extension module.

Keir Fraser 4 anos atrás
pai
commit
17607c3b5c

+ 2 - 0
.gitignore

@@ -15,4 +15,6 @@
 *.scp
 *.pyc
 scripts/greaseweazle/version.py
+scripts/greaseweazle/optimised/optimised*
+scripts/c_ext/build/
 Greaseweazle-*

+ 9 - 2
Makefile

@@ -2,7 +2,7 @@
 export FW_MAJOR := 0
 export FW_MINOR := 23
 
-TARGETS := all blinky clean dist windist mrproper ocd flash start serial
+TARGETS := all blinky clean dist windist mrproper ocd flash start serial pysetup
 .PHONY: $(TARGETS)
 
 ifneq ($(RULES_MK),y)
@@ -33,6 +33,7 @@ blinky:
 		Blinky.elf Blinky.bin Blinky.hex
 
 clean::
+	rm -rf scripts/greaseweazle/optimised/optimised* scripts/c_ext/build
 	rm -f *.hex *.upd scripts/greaseweazle/*.pyc
 	rm -f scripts/greaseweazle/version.py
 	find . -name __pycache__ | xargs rm -rf
@@ -52,8 +53,10 @@ dist:
 	cp -a README.md $(PROJ)-$(VER)/
 	cp -a gw $(PROJ)-$(VER)/
 	cp -a scripts/49-greaseweazle.rules $(PROJ)-$(VER)/scripts/
+	cp -a scripts/setup.sh $(PROJ)-$(VER)/scripts/
 	cp -a scripts/gw.py $(PROJ)-$(VER)/scripts/
 	cp -a scripts/greaseweazle $(PROJ)-$(VER)/scripts
+	cp -a scripts/c_ext $(PROJ)-$(VER)/scripts
 	cp -a scripts/misc/*.py $(PROJ)-$(VER)/scripts/misc/
 	cp -a RELEASE_NOTES $(PROJ)-$(VER)/
 	$(MAKE) clean
@@ -64,12 +67,13 @@ dist:
 	$(MAKE) clean
 	$(ZIP) $(PROJ)-$(VER).zip $(PROJ)-$(VER)
 
-windist:
+windist: pysetup
 	rm -rf $(PROJ)-$(VER) ipf ipf.zip
 	[ -e $(PROJ)-$(VER).zip ] || \
 	curl -L https://github.com/keirf/Greaseweazle/releases/download/$(VER)/$(PROJ)-$(VER).zip --output $(PROJ)-$(VER).zip
 	$(UNZIP) $(PROJ)-$(VER).zip
 	cp -a scripts/setup.py $(PROJ)-$(VER)/scripts
+	cp -a scripts/greaseweazle/optimised/optimised* $(PROJ)-$(VER)/scripts/greaseweazle/optimised
 	cd $(PROJ)-$(VER)/scripts && $(PYTHON) setup.py build
 	cp -a $(PROJ)-$(VER)/scripts/build/exe.win*/* $(PROJ)-$(VER)/
 	cp -a $(PROJ)-$(VER)/lib/bitarray/VCRUNTIME140.DLL $(PROJ)-$(VER)/
@@ -87,6 +91,9 @@ scripts/greaseweazle/version.py: Makefile
 	echo "major = $(FW_MAJOR)" >$@
 	echo "minor = $(FW_MINOR)" >>$@
 
+# Source scripts/setup.sh in the recipe shell, handing it the configured
+# $(PYTHON). The script pip-installs bitarray, crcmod and pyserial, then
+# builds the C extension into scripts/greaseweazle/optimised.
+pysetup:
+	PYTHON=$(PYTHON) . ./scripts/setup.sh
+
 BAUD=115200
 DEV=/dev/ttyUSB0
 

+ 247 - 0
scripts/c_ext/optimised.c

@@ -0,0 +1,247 @@
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#include <stdio.h>
+#include <stdint.h>
+
+#define FLUXOP_INDEX   1
+#define FLUXOP_SPACE   2
+#define FLUXOP_ASTABLE 3
+
+/*
+ * Invoke bitarray.append(value) through the generic method-call API.
+ * append_s holds the method name "append" and is created at module init.
+ * Returns 1 if the call succeeded, 0 if it returned NULL (call failed).
+ */
+static PyObject *append_s;
+static int bitarray_append(PyObject *bitarray, PyObject *value)
+{
+    PyObject *result;
+
+    result = PyObject_CallMethodObjArgs(bitarray, append_s, value, NULL);
+    if (result != NULL) {
+        /* The return value of append() is not needed: drop it. */
+        Py_DECREF(result);
+        return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Append a new Python int to a list. PyList_Append() takes its own
+ * reference, so the temporary must be released here to avoid a leak.
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int f2b_append_long(PyObject *list, long value)
+{
+    PyObject *obj = PyLong_FromLong(value);
+    int rc;
+
+    if (obj == NULL)
+        return -1;
+    rc = PyList_Append(list, obj);
+    Py_DECREF(obj);
+    return rc;
+}
+
+/*
+ * Append a new Python float to a list, releasing the temporary.
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int f2b_append_double(PyObject *list, double value)
+{
+    PyObject *obj = PyFloat_FromDouble(value);
+    int rc;
+
+    if (obj == NULL)
+        return -1;
+    rc = PyList_Append(list, obj);
+    Py_DECREF(obj);
+    return rc;
+}
+
+/*
+ * *value = float(next(iter)).
+ * Returns 0 on success, -1 with an exception set on failure. Exhaustion is
+ * reported as StopIteration, matching what next() does in the pure-Python
+ * implementation of this routine.
+ */
+static int f2b_next_double(PyObject *iter, double *value)
+{
+    PyObject *item = PyIter_Next(iter);
+
+    if (item == NULL) {
+        if (!PyErr_Occurred())
+            PyErr_SetNone(PyExc_StopIteration);
+        return -1;
+    }
+    *value = PyFloat_AsDouble(item);
+    Py_DECREF(item);
+    return PyErr_Occurred() ? -1 : 0;
+}
+
+/*
+ * flux_to_bitcells(bit_array, time_array, revolutions, index_iter,
+ *                  flux_iter, freq, clock_centre, clock_min, clock_max,
+ *                  pll_period_adj, pll_phase_adj)
+ *
+ * C counterpart of the Python flux_to_bitcells() routine: runs a software
+ * PLL over the flux intervals yielded by flux_iter and
+ *  - appends one bit per bitcell to bit_array (via its append() method),
+ *  - appends the duration of each bitcell to the list time_array,
+ *  - appends the bitcell count of each completed revolution to the list
+ *    revolutions whenever the running time crosses the next value taken
+ *    from index_iter.
+ *
+ * Returns None on success, or NULL with an exception set on failure.
+ */
+static PyObject *
+flux_to_bitcells(PyObject *self, PyObject *args)
+{
+    /* Parameters */
+    PyObject *bit_array, *time_array, *revolutions;
+    PyObject *index_iter, *flux_iter;
+    double freq, clock_centre, clock_min, clock_max;
+    double pll_period_adj, pll_phase_adj;
+
+    /* Local variables */
+    PyObject *item, *last_time;
+    double clock, new_ticks, ticks, to_index, next_index, bit_time;
+    int zeros, nbits;
+
+    if (!PyArg_ParseTuple(args, "OOOOOdddddd",
+                          &bit_array, &time_array, &revolutions,
+                          &index_iter, &flux_iter,
+                          &freq, &clock_centre, &clock_min, &clock_max,
+                          &pll_period_adj, &pll_phase_adj))
+        return NULL;
+
+    /* PyIter_Next() requires real iterators; check at run time rather than
+     * with assert(), which is compiled out in release builds. */
+    if (!PyIter_Check(index_iter) || !PyIter_Check(flux_iter)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "index_iter and flux_iter must be iterators");
+        return NULL;
+    }
+
+    nbits = 0;
+    ticks = 0.0;
+    clock = clock_centre;
+
+    /* to_index = next(index_iter) */
+    if (f2b_next_double(index_iter, &to_index) < 0)
+        return NULL;
+
+    /* for x in flux_iter: */
+    while ((item = PyIter_Next(flux_iter)) != NULL) {
+
+        double x = PyFloat_AsDouble(item);
+        Py_DECREF(item);
+        if (PyErr_Occurred())
+            return NULL;
+
+        /* Gather enough ticks to generate at least one bitcell. */
+        ticks += x / freq;
+        if (ticks < clock/2)
+            continue;
+
+        /* Clock out zero or more 0s, followed by a 1. */
+        for (zeros = 0; ; zeros++) {
+
+            /* Check if we cross the index mark. */
+            to_index -= clock;
+            if (to_index < 0) {
+                if (f2b_append_long(revolutions, nbits) < 0)
+                    return NULL;
+                nbits = 0;
+                /* to_index += next(index_iter) */
+                if (f2b_next_double(index_iter, &next_index) < 0)
+                    return NULL;
+                to_index += next_index;
+            }
+
+            nbits += 1;
+            ticks -= clock;
+            if (f2b_append_double(time_array, clock) < 0)
+                return NULL;
+            if (ticks < clock/2) {
+                if (!bitarray_append(bit_array, Py_True))
+                    return NULL;
+                break;
+            }
+
+            if (!bitarray_append(bit_array, Py_False))
+                return NULL;
+
+        }
+
+        /* The value just stored as time_array[-1]; remember it before the
+         * PLL modifies the clock so the phase correction can be added. */
+        bit_time = clock;
+
+        /* PLL: Adjust clock frequency according to phase mismatch. */
+        if (zeros <= 3) {
+            /* In sync: adjust clock by a fraction of the phase mismatch. */
+            clock += ticks * pll_period_adj;
+        } else {
+            /* Out of sync: adjust clock towards centre. */
+            clock += (clock_centre - clock) * pll_period_adj;
+        }
+        /* Clamp the clock's adjustment range. */
+        if (clock < clock_min)
+            clock = clock_min;
+        else if (clock > clock_max)
+            clock = clock_max;
+        /* PLL: Adjust clock phase according to mismatch. */
+        new_ticks = ticks * (1.0 - pll_phase_adj);
+        /* time_array[-1] += ticks - new_ticks
+         * (add to the last bitcell time, as the Python routine does, rather
+         * than overwriting it). PyList_SetItem() steals the reference. */
+        last_time = PyFloat_FromDouble(bit_time + (ticks - new_ticks));
+        if (last_time == NULL)
+            return NULL;
+        if (PyList_SetItem(time_array, PyList_Size(time_array)-1,
+                           last_time) < 0)
+            return NULL;
+        ticks = new_ticks;
+
+    }
+
+    /* PyIter_Next() returns NULL both on exhaustion and on error. */
+    if (PyErr_Occurred())
+        return NULL;
+
+    Py_RETURN_NONE;
+}
+
+
+/*
+ * Decode a 28-bit value stored in four bytes, seven bits per byte.
+ * Bit 0 of every byte is discarded; p[0] carries the least significant
+ * seven bits and p[3] the most significant.
+ */
+static int _read_28bit(const uint8_t *p)
+{
+    int x;
+    x  = (p[0]       ) >>  1;
+    x |= (p[1] & 0xfe) <<  6;
+    x |= (p[2] & 0xfe) << 13;
+    x |= (p[3] & 0xfe) << 20;
+    return x;
+}
+
+/*
+ * Append a new Python int to a list. PyList_Append() takes its own
+ * reference, so the temporary must be released here to avoid a leak.
+ * Returns 0 on success, -1 with an exception set on failure.
+ */
+static int decode_flux_append(PyObject *list, long value)
+{
+    PyObject *obj = PyLong_FromLong(value);
+    int rc;
+
+    if (obj == NULL)
+        return -1;
+    rc = PyList_Append(list, obj);
+    Py_DECREF(obj);
+    return rc;
+}
+
+/*
+ * decode_flux(dat) -> (flux, index)
+ *
+ * Decode a NUL-terminated encoded flux stream (any object supporting the
+ * buffer protocol) into two lists of ints:
+ *  - flux:  tick counts between flux transitions,
+ *  - index: tick counts between index marks.
+ *
+ * Stream encoding, as handled below:
+ *  - byte 0..249:   flux value equal to the byte;
+ *  - byte 250..254: two-byte flux value, 250 + (b0-250)*255 + b1 - 1;
+ *  - byte 255:      opcode byte follows; FLUXOP_INDEX and FLUXOP_SPACE are
+ *                   each followed by a 28-bit operand.
+ *
+ * Raises ValueError if the stream is not NUL-terminated, is truncated, or
+ * contains an opcode that is not handled here.
+ */
+static PyObject *
+decode_flux(PyObject *self, PyObject *args)
+{
+    /* Parameters */
+    Py_buffer bytearray;
+    PyObject *res = NULL;
+
+    /* bytearray buffer */
+    const uint8_t *p;
+    Py_ssize_t l;
+
+    /* Local variables */
+    PyObject *flux = NULL, *index = NULL;
+    long val, ticks, ticks_since_index;
+    int i, opcode;
+
+    if (!PyArg_ParseTuple(args, "y*", &bytearray))
+        return NULL;
+    p = bytearray.buf;
+    l = bytearray.len;
+
+    /* assert dat[-1] == 0 */
+    if ((l == 0) || (p[l-1] != 0)) {
+        PyErr_SetString(PyExc_ValueError, "Flux is not NUL-terminated");
+        goto out;
+    }
+    /* len(dat) -= 1 */
+    l -= 1;
+
+    /* flux, index = [], [] */
+    flux = PyList_New(0);
+    index = PyList_New(0);
+    if ((flux == NULL) || (index == NULL))
+        goto out;
+    /* ticks, ticks_since_index = 0, 0 */
+    ticks = 0;
+    ticks_since_index = 0;
+
+    while (l != 0) {
+        i = *p++;
+        if (i == 255) {
+            /* Account for the 255 marker and the opcode byte. */
+            if ((l -= 2) < 0)
+                goto oos;
+            opcode = *p++;
+            switch (opcode) {
+            case FLUXOP_INDEX:
+                if ((l -= 4) < 0)
+                    goto oos;
+                val = _read_28bit(p);
+                p += 4;
+                if (decode_flux_append(
+                        index, ticks_since_index + ticks + val) < 0)
+                    goto out;
+                ticks_since_index = -(ticks + val);
+                break;
+            case FLUXOP_SPACE:
+                if ((l -= 4) < 0)
+                    goto oos;
+                ticks += _read_28bit(p);
+                p += 4;
+                break;
+            default:
+                PyErr_Format(PyExc_ValueError,
+                             "Bad opcode in flux stream (%d)", opcode);
+                goto out;
+            }
+        } else {
+            if (i < 250) {
+                l -= 1;
+                val = i;
+            } else {
+                if ((l -= 2) < 0)
+                    goto oos;
+                val = 250 + (i - 250) * 255;
+                val += *p++ - 1;
+            }
+            ticks += val;
+            if (decode_flux_append(flux, ticks) < 0)
+                goto out;
+            ticks_since_index += ticks;
+            ticks = 0;
+        }
+    }
+
+    /* "OO" takes new references; ours are dropped at 'out' below. */
+    res = Py_BuildValue("OO", flux, index);
+
+out:
+    PyBuffer_Release(&bytearray);
+    /* Either list may still be NULL if we bailed out early. */
+    Py_XDECREF(flux);
+    Py_XDECREF(index);
+    return res;
+
+oos:
+    PyErr_SetString(PyExc_ValueError, "Unexpected end of flux");
+    goto out;
+}
+
+
+/* Method table: the functions this module exposes to Python. Both take
+ * positional arguments only (METH_VARARGS) and carry no docstring. */
+static PyMethodDef modulefuncs[] = {
+    { "flux_to_bitcells", flux_to_bitcells, METH_VARARGS, NULL },
+    { "decode_flux", decode_flux, METH_VARARGS, NULL },
+    { NULL } /* sentinel entry terminating the table */
+};
+
+/* Module definition for "optimised": no module docstring (0), m_size of
+ * -1, and the method table above. */
+static PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT, "optimised", 0, -1, modulefuncs,
+};
+
+/*
+ * Module initialisation entry point, called when "optimised" is imported.
+ * Creates the cached method-name string used by bitarray_append() and then
+ * the module object. Returns the new module, or NULL with an exception set
+ * if either step fails.
+ */
+PyMODINIT_FUNC PyInit_optimised(void)
+{
+    PyObject *module;
+
+    /* Interned "append": used as the method name for every bit appended. */
+    append_s = PyUnicode_InternFromString("append");
+    if (append_s == NULL)
+        return NULL;
+
+    module = PyModule_Create(&moduledef);
+    if (module == NULL)
+        Py_CLEAR(append_s);
+    return module;
+}

+ 6 - 0
scripts/c_ext/setup.py

@@ -0,0 +1,6 @@
+# Build script for the 'optimised' C extension module.
+# NOTE(review): distutils is deprecated (PEP 632) and was removed from the
+# standard library in Python 3.12 -- consider moving to setuptools.
+from distutils.core import setup, Extension
+
+# The extension is compiled from the single source file optimised.c,
+# located next to this script.
+module1 = Extension('optimised', sources = ['optimised.c'])
+
+setup(name = 'optimised',
+      ext_modules = [module1])

+ 25 - 0
scripts/greaseweazle/optimised/__init__.py

@@ -0,0 +1,25 @@
+# greaseweazle/optimised/__init__.py
+#
+# Written & released by Keir Fraser <keir.xen@gmail.com>
+#
+# This is free and unencumbered software released into the public domain.
+# See the file COPYING for more details, or visit <http://unlicense.org>.
+
+import os
+
+gw_opt = os.environ.get('GW_OPT')
+enabled = gw_opt is None or gw_opt.lower().startswith('y')
+if enabled:
+    try:
+        from .optimised import *
+    except ModuleNotFoundError:
+        enabled = False
+        print('*** WARNING: Optimised data routines not found: '
+              'Run scripts/setup.sh')
+else:
+    print('*** WARNING: Optimised data routines disabled (GW_OPT=%s)'
+          % gw_opt)
+
+# Local variables:
+# python-indent: 4
+# End:

+ 66 - 50
scripts/greaseweazle/track.py

@@ -9,6 +9,7 @@ import binascii
 import itertools as it
 from bitarray import bitarray
 from greaseweazle.flux import WriteoutFlux
+from greaseweazle import optimised
 
 # A pristine representation of a track, from a codec and/or a perfect image.
 class MasterTrack:
@@ -205,65 +206,80 @@ class RawTrack:
         clock = self.clock
         clock_min = self.clock * (1 - self.clock_max_adj)
         clock_max = self.clock * (1 + self.clock_max_adj)
-        ticks = 0.0
 
         index_iter = it.chain(iter(map(lambda x: x/freq, flux.index_list)),
                               [float('inf')])
 
-        bits, times = bitarray(endian='big'), []
-        to_index = next(index_iter)
-
         # Make sure there's enough time in the flux list to cover all
         # revolutions by appending a "large enough" final flux value.
         tail = max(0, sum(flux.index_list) - sum(flux.list) + clock*freq*2)
-        for x in it.chain(flux.list, [tail]):
-
-            # Gather enough ticks to generate at least one bitcell.
-            ticks += x / freq
-            if ticks < clock/2:
-                continue
-
-            # Clock out zero or more 0s, followed by a 1.
-            zeros = 0
-            while True:
-
-                # Check if we cross the index mark.
-                to_index -= clock
-                if to_index < 0:
-                    self.bitarray += bits
-                    self.timearray += times
-                    self.revolutions.append(len(times))
-                    assert len(times) == len(bits)
-                    to_index += next(index_iter)
-                    bits, times = bitarray(endian='big'), []
-
-                ticks -= clock
-                times.append(clock)
-                if ticks >= clock/2:
-                    zeros += 1
-                    bits.append(False)
-                else:
-                    bits.append(True)
-                    break
-
-            # PLL: Adjust clock frequency according to phase mismatch.
-            if zeros <= 3:
-                # In sync: adjust clock by a fraction of the phase mismatch.
-                clock += ticks * self.pll_period_adj
+        flux_iter = it.chain(flux.list, [tail])
+
+        try:
+            optimised.flux_to_bitcells(
+                self.bitarray, self.timearray, self.revolutions,
+                index_iter, flux_iter,
+                freq, clock, clock_min, clock_max,
+                self.pll_period_adj, self.pll_phase_adj)
+        except AttributeError:
+            flux_to_bitcells(
+                self.bitarray, self.timearray, self.revolutions,
+                index_iter, flux_iter,
+                freq, clock, clock_min, clock_max,
+                self.pll_period_adj, self.pll_phase_adj)
+
+            
+def flux_to_bitcells(bit_array, time_array, revolutions,
+                     index_iter, flux_iter,
+                     freq, clock_centre, clock_min, clock_max,
+                     pll_period_adj, pll_phase_adj):
+
+    nbits = 0
+    ticks = 0.0
+    clock = clock_centre
+    to_index = next(index_iter)
+
+    for x in flux_iter:
+
+        # Gather enough ticks to generate at least one bitcell.
+        ticks += x / freq
+        if ticks < clock/2:
+            continue
+
+        # Clock out zero or more 0s, followed by a 1.
+        zeros = 0
+        while True:
+
+            # Check if we cross the index mark.
+            to_index -= clock
+            if to_index < 0:
+                revolutions.append(nbits)
+                nbits = 0
+                to_index += next(index_iter)
+
+            nbits += 1
+            ticks -= clock
+            time_array.append(clock)
+            if ticks >= clock/2:
+                zeros += 1
+                bit_array.append(False)
             else:
-                # Out of sync: adjust clock towards centre.
-                clock += (self.clock - clock) * self.pll_period_adj
-            # Clamp the clock's adjustment range.
-            clock = min(max(clock, clock_min), clock_max)
-            # PLL: Adjust clock phase according to mismatch.
-            new_ticks = ticks * (1 - self.pll_phase_adj)
-            times[-1] += ticks - new_ticks
-            ticks = new_ticks
-
-        # Append trailing bits.
-        self.bitarray += bits
-        self.timearray += times
+                bit_array.append(True)
+                break
 
+        # PLL: Adjust clock frequency according to phase mismatch.
+        if zeros <= 3:
+            # In sync: adjust clock by a fraction of the phase mismatch.
+            clock += ticks * pll_period_adj
+        else:
+            # Out of sync: adjust clock towards centre.
+            clock += (clock_centre - clock) * pll_period_adj
+        # Clamp the clock's adjustment range.
+        clock = min(max(clock, clock_min), clock_max)
+        # PLL: Adjust clock phase according to mismatch.
+        new_ticks = ticks * (1 - pll_phase_adj)
+        time_array[-1] += ticks - new_ticks
+        ticks = new_ticks
 
 # Local variables:
 # python-indent: 4

+ 6 - 2
scripts/greaseweazle/usb.py

@@ -10,6 +10,7 @@ import itertools as it
 from greaseweazle import version
 from greaseweazle import error
 from greaseweazle.flux import Flux
+from greaseweazle import optimised
 
 ## Control-Path command set
 class ControlCmd:
@@ -391,8 +392,11 @@ class Unit:
                 # Success!
                 break
 
-        # Decode the flux list and read the index-times list.
-        flux_list, index_list = self._decode_flux(dat)
+        try:
+            # Decode the flux list and read the index-times list.
+            flux_list, index_list = optimised.decode_flux(dat)
+        except AttributeError:
+            flux_list, index_list = self._decode_flux(dat)
 
         # Success: Return the requested full index-to-index revolutions.
         return Flux(index_list, flux_list, self.sample_freq, index_cued=False)

+ 4 - 0
scripts/setup.sh

@@ -0,0 +1,4 @@
+#!/bin/bash
+# Interpreter to use: taken from $PYTHON, defaulting to python3 when that
+# variable is unset or empty.
+PYTHON="${PYTHON:-python3}"
+# Install bitarray, crcmod and pyserial for the current user (pip --user).
+$PYTHON -m pip install --user bitarray crcmod pyserial
+# Build the C extension in a subshell and install the compiled module into
+# scripts/greaseweazle/optimised. The paths are relative, so this must be
+# run from the directory that contains scripts/.
+(cd ./scripts/c_ext && $PYTHON setup.py install --install-platlib=../greaseweazle/optimised)