(* Define the bit patterns for each nucleotide *)
nucleotideToBit = <|"A" -> "00", "C" -> "01", "G" -> "10", "T" -> "11"|>;
bitToNucleotide = <|"00" -> "A", "01" -> "C", "10" -> "G", "11" -> "T"|>;
(* Function to encode a DNA sequence into a ByteArray *)
encodeDNA[sequence_String] := Module[{bitString, paddedBitString, byteList},
bitString = StringJoin[sequence /. nucleotideToBit];
(* Pad the bitString to make it a multiple of 8 bits (if necessary) *)
paddedBitString = StringPadRight[bitString, Ceiling[StringLength[bitString]/8]*8, "0"];
(* Convert the padded bit string to a list of bytes *)
byteList = FromDigits[#, 2] & /@ StringPartition[paddedBitString, 8];
(* Return the ByteArray *)
ByteArray[byteList]
]
(* Function to decode a ByteArray back into a DNA sequence *)
decodeDNA[byteArray_ByteArray] := Module[{bitString, nucleotideList, decodedSequence},
bitString = StringJoin[IntegerString[#, 2, 8] & /@ Normal[byteArray]];
nucleotideList = StringPartition[bitString, 2];
decodedSequence = StringJoin[nucleotideList /. bitToNucleotide];
decodedSequence
]
(* Function to convert a DNA string pattern to its bitwise integer representation *)
dnaPatternToBits[pattern_String] := StringJoin[pattern /. nucleotideToBit]
(* Function to match bits in the ByteArray with the pattern *)
matchBits[byteArray_ByteArray, startBit_, patternBits_String] := Module[
{bitString, bitSegment},
bitString = StringJoin[IntegerString[#, 2, 8] & /@ Normal[byteArray]];
bitSegment = StringTake[bitString, {startBit + 1, startBit + StringLength[patternBits]}];
bitSegment === patternBits
]
(* Function to search for the pattern using optimized bitwise comparison *)
searchDNA[byteArray_ByteArray, patternPrefix_String, patternSuffix_String, wildcardNucleotides_] := Module[
{bitLength, prefixBits, suffixBits, prefixLength, suffixLength, wildcardLength, matches = {}, i},
bitLength = Length[byteArray] * 8;
prefixBits = dnaPatternToBits[patternPrefix];
suffixBits = dnaPatternToBits[patternSuffix];
prefixLength = StringLength[prefixBits];
suffixLength = StringLength[suffixBits];
wildcardLength = wildcardNucleotides * 2; (* Convert nucleotides to bits *)
For[i = 0, i <= bitLength - (prefixLength + wildcardLength + suffixLength), i += 2,
If[
matchBits[byteArray, i, prefixBits] &&
matchBits[byteArray, i + prefixLength + wildcardLength, suffixBits],
AppendTo[matches, i/2] (* Convert bit position to nucleotide position *)
]
];
matches
]
(* Function to extract the actual wildcard strings found between the prefix and suffix *)
extractWildcards[byteArray_ByteArray, matches_List, prefixLength_Integer, wildcardNucleotides_Integer] := Module[
{wildcards = {}, wildcardLength, i, match, bitString, wildcardBits, nucleotideList, wildcardDNA},
wildcardLength = wildcardNucleotides * 2; (* Convert nucleotides to bits *)
For[i = 1, i <= Length[matches], i++,
match = matches[[i]];
bitString = StringJoin[IntegerString[#, 2, 8] & /@ Normal[byteArray]];
wildcardBits = StringTake[bitString, {match * 2 + prefixLength + 1, match * 2 + prefixLength + wildcardLength}];
(* Convert the bit string back to a DNA string *)
nucleotideList = StringPartition[wildcardBits, 2];
wildcardDNA = StringJoin[nucleotideList /. bitToNucleotide];
AppendTo[wildcards, wildcardDNA];
];
wildcards
]
(* Example usage *)
dnaString = "AACAAAAAAAAAAAAAAAAAAAAGG";
bitEncodedDNA = encodeDNA[dnaString];
decodedDNA = decodeDNA[bitEncodedDNA];
Print["Decoded DNA Sequence: ", decodedDNA];
patternPrefix = "CAA";
patternSuffix = "AG";
wildcardNucleotides = 17;
matches = searchDNA[bitEncodedDNA, patternPrefix, patternSuffix, wildcardNucleotides];
Print["Matches found at nucleotide positions: ", matches];
wildcards = extractWildcards[bitEncodedDNA, matches, StringLength[patternPrefix] * 2, wildcardNucleotides];
Print["Wildcard sequences found: ", wildcards];