Java – extract a sequence of bits of any length from the byte [] array

I am looking for the most efficient way to extract a sequence of (unsigned) bits of any length (0 < = length = 16) at any location The framework class shows how my current implementation handles the problem:

public abstract class BitArray {

byte[] bytes = new byte[2048];
int bitGet;

public BitArray() {
}

public void readNextBlock(int initialBitGet,int count) {
    // substitute for reading from an input stream 
    for (int i=(initialBitGet>>3); i<=count; ++i) {
        bytes[i] = (byte) i;
    }
    prepareBitGet(initialBitGet,count);
}

public abstract void prepareBitGet(int initialBitGet,int count);

public abstract int getBits(int count);

static class Version0 extends BitArray {
    public void prepareBitGet(int initialBitGet,int count) {
        bitGet = initialBitGet;
    }

    public int getBits(int len) {
        // intentionally gives meaningless result
        bitGet += len;
        return 0;
    }
}

static class Version1 extends BitArray {
    public void prepareBitGet(int initialBitGet,int count) {
        bitGet = initialBitGet - 1;
    }

    public int getBits(int len) {
        int byteIndex = bitGet;
        bitGet = byteIndex + len;
        int shift = 23 - (byteIndex & 7) - len;
        int mask = (1 << len) - 1;
        byteIndex >>= 3;
        return (((bytes[byteIndex] << 16) | 
               ((bytes[++byteIndex] & 0xFF) <<  8) |
                (bytes[++byteIndex] & 0xFF)) >> shift) & mask;
    }
}

static class Version2 extends BitArray {
    static final int[] mask = { 0x0,0x1,0x3,0x7,0xF,0x1F,0x3F,0x7F,0xFF,0x1FF,0x3FF,0x7FF,0xFFF,0x1FFF,0x3FFF,0x7FFF,0xFFFF };

    public void prepareBitGet(int initialBitGet,int count) {
        bitGet = initialBitGet;
    }

    public int getBits(int len) {
        int offset = bitGet;
        bitGet = offset + len;
        int byteIndex = offset >> 3; // originally used /8
        int bitIndex = offset & 7;   // originally used %8
        if ((bitIndex + len) > 16) {
            return ((bytes[byteIndex] << 16 |
                    (bytes[byteIndex + 1] & 0xFF) << 8 |
                    (bytes[byteIndex + 2] & 0xFF)) >> (24 - bitIndex - len)) & mask[len];
        } else if ((offset + len) > 8) {
            return ((bytes[byteIndex] << 8 |
                    (bytes[byteIndex + 1] & 0xFF)) >> (16 - bitIndex - len)) & mask[len];
        } else {
            return (bytes[byteIndex] >> (8 - offset - len)) & mask[len];
        }
    }
}

static class Version3 extends BitArray {
    int[] ints = new int[2048];

    public void prepareBitGet(int initialBitGet,int count) {
        bitGet = initialBitGet;
        int put_i = (initialBitGet >> 3) - 1;
        int get_i = put_i;
        int buf;
        buf = ((bytes[++get_i] & 0xFF) << 16) |
              ((bytes[++get_i] & 0xFF) <<  8) |
               (bytes[++get_i] & 0xFF);
        do {
            buf = (buf << 8) | (bytes[++get_i] & 0xFF);
            ints[++put_i] = buf;
        } while (get_i < count);
    }

    public int getBits(int len) {
        int bit_idx = bitGet;
        bitGet = bit_idx + len;
        int shift = 32 - (bit_idx & 7) - len;
        int mask = (1 << len) - 1;
        int int_idx = bit_idx >> 3;
        return (ints[int_idx] >> shift) & mask;
    }
}

static class Version4 extends BitArray {
    int[] ints = new int[1024];

    public void prepareBitGet(int initialBitGet,int count) {
        bitGet = initialBitGet;
        int g = initialBitGet >> 3;
        int p = (initialBitGet >> 4) - 1;
        final byte[] b = bytes;
        int t = (b[g]  <<  8) | (b[++g] & 0xFF);
        final int[] i = ints;
        do {
            i[++p] = (t = (t << 16) | ((b[++g] & 0xFF) <<8) | (b[++g] & 0xFF));
        } while (g < count);
    }

    public int getBits(final int len) {
        final int i;
        bitGet = (i = bitGet) + len;
        return (ints[i >> 4] >> (32 - len - (i & 15))) & ((1 << len) - 1);
    }
}

public void benchmark(String label) {
    int checksum = 0;
    readNextBlock(32,1927);
    long time = System.nanoTime();
    for (int pass=1<<18; pass>0; --pass) {
        prepareBitGet(32,1927);
        for (int i=2047; i>=0; --i) {
            checksum += getBits(i & 15);
        }
    }
    time = System.nanoTime() - time;
    System.out.println(label+" took "+Math.round(time/1E6D)+" ms,checksum="+checksum);
    try { // avoid having the console interfere with our next measurement
        Thread.sleep(369);
    } catch (InterruptedException e) {}
}

public static void main(String[] argv) {
    BitArray test;
    // for the sake of getting a little less influence from the OS for stable measurement
    Thread.currentThread().setPriority(Thread.MAX_PRIORITY);
    while (true) {
        test = new Version0();
        test.benchmark("no implementaion");
        test = new Version1();
        test.benchmark("Durandal's (original)");
        test = new Version2();
        test.benchmark("blitzpasta's (adapted)");
        test = new Version3();
        test.benchmark("MSN's (posted)");
        test = new Version4();
        test.benchmark("MSN's (half-buffer modification)");
        System.out.println("--- next pass ---");
    }
}
}

This is effective, but I am looking for a more effective solution (performance wise) The byte array is guaranteed to be relatively small, with a maximum of 1800 bytes between a few bytes The array is read completely once (completely) between each call to the read method There is no need to do any error checking in getbits (), such as out of array, etc

It seems that my preliminary questions above are not clear enough N-bit "bit sequence" forms n-bit integers. I need to extract these integers with minimal overhead I don't use strings because values are used as lookup indexes or feed some calculations directly So basically, the framework shown above is a real class, and the getbits () signature shows how the rest of the code interacts with it

Extend the sample code to the micro benchmark, including blitz pasta's solution (fixed missing byte masking) In my old amd box, the result is ~ 11400ms vs ~ 38000ms FYI: its partitioning and simulation operations kill performance If you replace > > 3 and% 8 with & 8, & 7, the two solutions are very close to each other (jdk1.7.0ea104)

There seems to be a bit of confusion about how and what work The first original post of the sample code includes a read () method to indicate where and when the byte buffer is filled This is lost when the code becomes a micro rack I reintroduced it to make it clearer The idea is to beat all existing versions by adding another subclass of bitarray that needs to implement getbits() and preparebitget(), which may be empty Don't change the benchmark to make your solution have advantages. All existing solutions can complete the same operation, so as to make it a perfect optimization! (really!!)

I added a version 0, which will only increase the bitget status Always return 0 to see how expensive the benchmark is It only compares there

In addition, an adaptation to the idea of MSN (version 3) is added In order to maintain fairness and comparability among all competitors, byte array filling is now part of the benchmark and the preparation steps (see above) The original MSN solution was not so good, and there was a lot of overhead in preparing the int [] buffer I freely optimized one step, which became a fierce competitor:) you may also find that I have removed some of your code Your getbit () may be reduced to 3 columns, which may be reduced by one or two percentage points I do this deliberately to keep the code readable, because other versions are not as small as possible (again, readable)

Conclusion (the code example update above includes versions based on all applicable contributions) On my old amd box (Sun JRE 1.6.0_21), they came out:

V0 did not implement 5384 Ms. V1 Durandal (original) took 10283 Ms. V2 blitzpasta (adaptation) took 12212 Ms. V3 MSN (release) took 11030 Ms. V4 MSN (half buffer modification) took 9700 Ms

Note: in this benchmark, the average call to getbits () gets 7.5 bits, and each bit is read only once Since V3 / V4 must pay higher initialization costs, they tend to show better runtime behavior with shorter fetch times (so the maximum value closer to the average fetch size is 16) Nevertheless, V4 remains ahead of all other options in all cases In practical applications, cache contention must be considered because the additional space required by V3 / V4 may increase cache misses to V0, which is a better choice If the array is traversed more than once, V4 should be advantageous because it is faster than others, and expensive initialization is allocated after the first pass

Solution

Then, according to the distance between the drop time you want and the memory seesaw, you can allocate a 32-bit side table for every 16 bit offset, and then offset according to the 16 bits:

byte[] bytes = new byte[2048];   
int bitGet;   
unsigned int dwords[] = new unsigned int[2046];

public BitArray() {   
    for (int i=0; i<bytes.length; ++i) {   
        bytes[i] = (byte) i;   
    }   

    for (int i= 0; i<dwords.length; ++i) {
        dwords[i]= 
            (bytes[i    ] << 24) | 
            (bytes[i + 1] << 16) | 
            (bytes[i + 2] <<  8) | 
            (bytes[i + 3]);
    }
}   

int getBits(int len)
{
    int offset= bitGet;
    int offset_index= offset>>4;
    int offset_offset= offset & 15;

    return (dwords[offset_index] >> offset_offset) & ((1 << len) - 1);
}

You can avoid branching (at the cost of four times your memory) And find masks just faster than (1 < len) - 1?

The content of this article comes from the network collection of netizens. It is used as a learning reference. The copyright belongs to the original author.
THE END
分享
二维码
< <上一篇
下一篇>>