As a side project I was trying to understand closed caption encoding, and wrote a simple function to encode strings into SCC-compliant data. It works, but I've rewritten it twice trying to reduce it, and it comes out about the same every time. Granted, I suck at developing algorithms, so no need to state it. Any suggestions are appreciated.
Requirements:
# each line must be less than 28 chars (32 - 4)
# words should not be split on two lines. If a word exceeds the number of chars, start a new line
# must be grouped in two byte hex words, separated by a space
# if only one char left in two byte hex word, use '0x80' to complete the word
So, "This is sample closed caption text." would be converted to
5468 6973 2069 7320 7361 6d70 6c65 2063 6c6f 7365 6480 13f2 13f2 6361 7074 696f 6e20 7465 7874 2e80
13f2 13f2 is the code for a new line. Here's the code:
Requirements:
# each line must be less than 28 chars (32 - 4)
# words should not be split on two lines. If a word exceeds the number of chars, start a new line
# must be grouped in two byte hex words, separated by a space
# if only one char left in two byte hex word, use '0x80' to complete the word
So, "This is sample closed caption text." would be converted to
5468 6973 2069 7320 7361 6d70 6c65 2063 6c6f 7365 6480 13f2 13f2 6361 7074 696f 6e20 7465 7874 2e80
13f2 13f2 is the code for a new line. Here's the code:
Code:
import binascii
import logging
import re
class ClosedCaption(object):
    """Encode plain text as SCC-style closed caption hex data.

    The text is wrapped into caption rows without splitting words, each row
    is written as space-separated two-byte hex words, and rows are joined
    with the ``next_line`` control code.
    """

    # Each row must be shorter than 28 characters (32 - 4).
    MAX_LINE_CHARS = 27
    # Filler that completes a two-byte word holding only one character.
    FILLER = "80"

    _log = logging.getLogger(__name__)

    def __init__(self):
        # Position cursor at row 13, column 04, with plain white text.
        self.next_line = "13f2 13f2 "

    def strToSCC(self, s):
        """Return *s* encoded as space-separated two-byte hex words.

        Words are split on whitespace and packed greedily into rows of at
        most ``MAX_LINE_CHARS`` characters; a word is never split across
        rows (a word longer than the limit gets a row of its own). A row
        with an odd number of characters is padded with ``FILLER`` so the
        row-break code always starts on a word boundary. The result has no
        leading or trailing whitespace; empty or blank input gives ``""``.

        Raises:
            ValueError: if a character does not fit in a single byte.
        """
        rows = self._wrap(s.split())
        self._log.debug("wrapped rows: %r", rows)
        encoded_rows = [self._encode_row(row) for row in rows]
        # next_line already ends with a space, so only a leading one is added.
        result = (" " + self.next_line).join(encoded_rows)
        self._log.debug("encoded: %s", result)
        return result

    def _wrap(self, words):
        """Greedily pack *words* into rows of at most MAX_LINE_CHARS chars."""
        rows = []
        current = ""
        for word in words:
            candidate = current + " " + word if current else word
            if current and len(candidate) > self.MAX_LINE_CHARS:
                rows.append(current)
                current = word
            else:
                # Also taken for an oversized first word: keep it whole.
                current = candidate
        if current:
            rows.append(current)
        return rows

    def _encode_row(self, row):
        """Hex-encode one row as two-byte words, padding an odd tail."""
        codes = []
        for ch in row:
            value = ord(ch)
            if value > 0xFF:
                # A wider code point would break the two-hex-digit grouping.
                raise ValueError("character %r does not fit in one byte" % ch)
            codes.append("%02x" % value)
        if len(codes) % 2:
            codes.append(self.FILLER)
        return " ".join(hi + lo for hi, lo in zip(codes[::2], codes[1::2]))
# start
if __name__ == "__main__":
    # Run the sample conversion only when executed as a script, and show the
    # encoded result instead of discarding the return value.
    test = ClosedCaption()
    sample_str = "This is sample closed caption text."
    print(test.strToSCC(sample_str))
Last edited: