Python fails to recognize a digit

I have an input file such as this and the program removes everything but the hindi text.

1
00:00:10,240 --> 00:00:13,824
विकास नाम का एक गरीब मजदूर था

2
00:00:14,592 --> 00:00:15,360
जो सेठ

3
00:00:15,616 --> 00:00:16,896
भीमसेन के यहां

Here is my program

#!/usr/bin/python
# -*- coding:utf-8 -*-

import sys
import re
import string
import codecs

def del_brackets(s):
        a = re.compile(r'<.*?>')
        result = a.sub('', s)
        return result.strip('n').strip()

def del_brackets2(s):
        a = re.compile(r'[.*?]')
        result = a.sub('', s)
        return result.strip('n').strip()

def del_brackets3(s):
        a = re.compile(r'{.*?}')
        result = a.sub('', s)
        return result.strip('n').strip()

def del_brackets4(s):
        a = re.compile(r'(.*?)')
        result = a.sub('', s)
        return result.strip('n').strip()

with open(sys.argv[1], 'r') as f:
    lines = f.readlines()

outfile = open(sys.argv[1].replace('.srt', '.txt'), 'w')

exclude = set('♪"#$%&()*+-/:<=>@[\]^_`{|}')
for line in lines:
#   print(repr(line))
    line = line.strip()
    #line = unicode(line.strip('n'), 'utf-8')
    if len(line.strip()) != 0 and line != 1 and line != "1":
        if (not line.isdigit()) and ('-->' not in line):
            line = del_brackets(line)
            line = del_brackets2(line)
            line = del_brackets3(line)
            line = del_brackets4(line)
            line = ' '.join(''.join(' ' if ch in exclude else ch for ch in line).split())
            line = re.sub(r'...', ' ', line)
            outfile.write(line.lstrip() + "n")

outfile.close()

and the expected output is below

विकास नाम का एक गरीब मजदूर था
जो सेठ
भीमसेन के यहां

However, my program doesn’t recognize the first line digit, and instead it returns

1
विकास नाम का एक गरीब मजदूर था
जो सेठ
भीमसेन के यहां

Why does this program doesn’t recognize the digit when I specifically wrote 1 or “1”?

Answer

Using regex we can create a simple expression that covers the three cases that you want to ignore:

timestamp line
number line
empty line

From there we can use python’s built-in filter method to filter out all of the undesired lines, and use the filter results as the lines to write.

import sys, re

def pruneSRTtoTXT(fn):
    fn2    = fn.replace('.srt', '.txt')
    stamp  = '[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}'
    ignore = re.compile(f'^({stamp}s-->s{stamp}|[0-9]+|[rn]+)$', re.M)

    with open(fn, 'r') as f, open(fn2, 'w') as f2:
        f2.writelines(filter(lambda l: not ignore.search(l), f.readlines()))

pruneSRTtoTXT(sys.argv[1])

Advertisement

Answer