Friday, February 24, 2017

Using regex to replace file data

Leave a Comment

With some help from here, I have this working almost exactly the way I want. Now I need to add the ability to remove data from a file before the files are compared.

The reason for this is that the strings ("data") I'm removing are known to differ each time the file is saved.

I have written a regex to select the exact text that I want to remove, but I am having trouble implementing it with my current code.

Here are the three main functions

# Captures the name that follows "hostname " at the start of a line.
HOSTNAME_RE = re.compile(r'hostname +(\S+)')


def get_file_info_from_lines(filename, file_lines):
    """Return ``(hostname, filename, sha1_hexdigest)`` for one file.

    Every line in ``file_lines`` is UTF-8 encoded and fed to a SHA-1 hash.
    ``hostname`` is taken from the last line that matches ``HOSTNAME_RE`` at
    its start, or is ``None`` when no line matches. ``filename`` is returned
    unchanged.
    """
    hostname = None
    a_hash = hashlib.sha1()
    for line in file_lines:
        a_hash.update(line.encode('utf-8'))
        match = HOSTNAME_RE.match(line)
        if match:
            hostname = match.group(1)
    return hostname, filename, a_hash.hexdigest()


def get_file_info(filename):
    """Return the info tuple for a config file, or ``None`` for other files.

    Only names ending in ``.cfg``, ``.startup`` or ``.confg`` are read.
    """
    if not filename.endswith(('.cfg', '.startup', '.confg')):
        return None
    # Read-only mode: the file is only hashed, never written, so "r+" would
    # needlessly require write permission.
    with open(filename, "r") as in_file:
        # NOTE: the attempt below does not strip anything from the data that
        # gets hashed -- it runs the substitution on the file *name*, not on
        # the file contents:
        # filename = re.sub(REMOVE_RE, subst, filename, 0, re.MULTILINE)
        return get_file_info_from_lines(filename, in_file.readlines())


def hostname_parse(directory):
    """Map each hostname found in ``directory`` to its info tuple.

    Files that ``get_file_info`` skips are left out. Files that share a
    hostname (including ``None``) overwrite one another; the last one wins.
    """
    results = {}
    # List the directory once so the progress total and the loop agree.
    filenames = os.listdir(directory)
    total = len(filenames)
    for i, filename in enumerate(filenames, start=1):
        filename = os.path.join(directory, filename)
        # sleep() and progress_bar() are defined outside this snippet.
        sleep(0.001)
        progress_bar(i, total, prefix='Progress:', suffix='Complete',
                     barLength=50)
        info = get_file_info(filename)
        if info is not None:
            results[info[0]] = info
    return results

This is the regex for finding the strings to be removed.

# Matches "Current configuration" (at a word boundary) followed by six
# repetitions of "rest of line + optional newline", i.e. the header line and
# up to five lines after it.
REMOVE_RE = r"((?:\bCurrent configuration)(?:.*\n?){6})"
# Replacement text: the matched block is replaced with nothing.
subst = ""

EXAMPLE_FILE_BEFORE_DATA_REMOVED:

Building configuration...  Current configuration : 45617 bytes ! ! Last configuration change at 00:22:36 UTC Sun Jan 22 2017 by user ! NVRAM config last updated at 00:22:43 UTC Sun Jan 22 2017 by user ! version 15.0 no service pad ! no logging console enable secret 5 ***encrypted password*** ! username admin privilege 15 password 7 ***encrypted password*** username sadmin privilege 15 secret 5 ***encrypted password*** aaa new-model ! ip ftp username ***encrypted password*** ip ftp password 7 ***encrypted password*** ip ssh version 2 ! line con 0  password 7 ***encrypted password***  login authentication maint line vty 0 4  password 7 ***encrypted password***  length 0  transport input ssh line vty 5 15  password 7 ***encrypted password***  transport input ssh ! 

EXAMPLE_FILE_AFTER_DATA_REMOVED:

Building configuration...  ! no service pad ! no logging console enable  ! username admin privilege 15  username gisadmin privilege 15  aaa new-model ! ip ftp username cfgftp ip ftp  ip ssh version 2 ! line con 0   login authentication maint line vty 0 4   length 0  transport input ssh line vty 5 15   transport input ssh ! 

I've tried doing something like #filename = re.sub(REMOVE_RE, subst, filename, 0, re.MULTILINE) within the get_file_info and get_file_info_from_lines but I'm obviously not implementing it correctly.

Any help would be appreciated as I am just learning.

Running the Compare:

results1 = hostname_parse('test1.txt')
results2 = hostname_parse('test2.txt')

# Report every hostname that appears in both result sets with a different
# file hash, together with the two files involved.
for hostname, filename, filehash in results1.values():
    if hostname not in results2:
        continue
    _, filename2, filehash2 = results2[hostname]
    if filehash == filehash2:
        continue
    print("%s has a change (%s, %s)" % (
        hostname, filehash, filehash2))
    print(filename)
    print(filename2)
    print()

I do not want to modify the current file. It would be great if all of this could be done in memory or in a temporary file.

FULL CODE:

import hashlib
import os
import re

# Captures the name that follows "hostname " at the start of a line.
HOSTNAME_RE = re.compile(r'hostname +(\S+)')
# Matches "Current configuration" followed by six repetitions of
# "rest of line + optional newline": the header line and up to five more.
REMOVE_RE = re.compile(r"((?:\bCurrent configuration)(?:.*\n?){6})")


def get_file_info_from_lines(filename, file_lines):
    """Return ``(hostname, filename, sha1_hexdigest)`` for one file.

    The block matched by ``REMOVE_RE`` is removed before hashing, so files
    that differ only inside that block get the same digest. ``hostname`` is
    taken from the last remaining line that matches ``HOSTNAME_RE`` at its
    start, or is ``None`` when no line matches.
    """
    # REMOVE_RE spans several lines, so it must run on the joined text.
    # Testing one line at a time would only ever drop the header line and
    # leave the changing timestamp lines in the hash.
    text = REMOVE_RE.sub('', ''.join(file_lines))
    a_hash = hashlib.sha1(text.encode('utf-8'))

    hostname = None
    for line in text.split('\n'):
        match = HOSTNAME_RE.match(line)
        if match:
            hostname = match.group(1)
    return hostname, filename, a_hash.hexdigest()


def get_file_info(filename):
    """Return the info tuple for a config file, or ``None`` for other files.

    Only names ending in ``.cfg``, ``.startup`` or ``.confg`` are read.
    """
    if not filename.endswith(('.cfg', '.startup', '.confg')):
        return None
    # Read-only mode: the file is never modified, all filtering is in memory.
    with open(filename, "r") as in_file:
        return get_file_info_from_lines(filename, in_file.readlines())


def hostname_parse(directory):
    """Map each hostname found in ``directory`` to its info tuple.

    Files that ``get_file_info`` skips are left out. Files that share a
    hostname (including ``None``) overwrite one another; the last one wins.
    """
    results = {}
    for filename in os.listdir(directory):
        filename = os.path.join(directory, filename)
        info = get_file_info(filename)
        if info is not None:
            results[info[0]] = info
    return results


def main():
    """Compare two directories and print the hostnames whose hash changed."""
    results1 = hostname_parse('test1')  # Directory of test files
    results2 = hostname_parse('test2')  # Directory of test files 2

    for hostname, filename, filehash in results1.values():
        if hostname in results2:
            _, filename2, filehash2 = results2[hostname]
            if filehash != filehash2:
                print("%s has a change (%s, %s)" % (
                    hostname, filehash, filehash2))
                print(filename)
                print(filename2)
                print()


# Run the comparison only when executed as a script, not on import.
if __name__ == '__main__':
    main()

2 Answers

Answers 1

In get_file_info_from_lines, simply ignore the line if it matches your regular expression. This way you don't need to actually modify the file or create another file, you simply calculate the hash with the lines that actually matter.

# REMOVE_RE spans six lines, so apply it to the whole text: testing one line
# at a time would only ever skip the "Current configuration" line itself and
# leave the changing timestamp lines in the hash.
text = re.sub(REMOVE_RE, '', ''.join(file_lines))
a_hash.update(text.encode('utf-8'))

Answers 2

Hi, I suggest the following approach: use a function to clean each line, then process the lines to remove the empty ones.

Then use difflib to compare. Run python -m doctest file.py to check the doctests.

import re

# Sample configuration as saved by the device.
source_content = """
Building configuration...

Current configuration : 45617 bytes
!
! Last configuration change at 00:22:36 UTC Sun Jan 22 2017 by user
! NVRAM config last updated at 00:22:43 UTC Sun Jan 22 2017 by user
!
version 15.0
no service pad
!
no logging console
enable secret 5 ***encrypted password***
!
username admin privilege 15 password 7 ***encrypted password***
username sadmin privilege 15 secret 5 ***encrypted password***
aaa new-model
!
ip ftp username ***encrypted password***
ip ftp password 7 ***encrypted password***
ip ssh version 2
!
line con 0
 password 7 ***encrypted password***
 login authentication maint
line vty 0 4
 password 7 ***encrypted password***
 length 0
 transport input ssh
line vty 5 15
 password 7 ***encrypted password***
 transport input ssh
!
"""

# Sample of the desired cleaned output; kept for reference, it is not used
# by the code below.
target_content = """
Building configuration...

!
no service pad
!
no logging console
enable 
!
username admin privilege 15 
username gisadmin privilege 15 
aaa new-model
!
ip ftp username cfgftp
ip ftp 
ip ssh version 2
!
line con 0

 login authentication maint
line vty 0 4

 length 0
 transport input ssh
line vty 5 15

 transport input ssh
!
"""


HOSTNAME_RE = re.compile(r'hostname +(\S+)')
REMOVE_RE = re.compile(r"((?:\bCurrent configuration)(?:.*\n?){6})")


def process_line(line):
    """Normalise a single configuration line.

    Lines starting with ``!`` collapse to ``'!'``, a hostname line is reduced
    to the hostname itself, a line matching ``REMOVE_RE`` becomes ``''`` and
    any other line is returned unchanged.

    >>> process_line('! rgrg')
    '!'
    >>> process_line('hostname router1')
    'router1'
    >>> process_line('Current configuration : 45617 bytes')
    ''
    >>> process_line('username admin privilege 15 password 7 ***encrypted password***')
    'username admin privilege 15 password 7 ***encrypted password***'
    """
    if line.startswith('!'):
        return '!'
    # Keep the match object: the original referenced an undefined ``match``
    # and raised NameError on every hostname line.
    match = HOSTNAME_RE.match(line)
    if match:
        return match.group(1)
    if REMOVE_RE.match(line):
        return ''
    return line


def clean_lines(lines, flag=''):
    """Replace each run of consecutive ``flag`` lines with a single one.

    A line counts as a ``flag`` line when it equals ``flag`` once surrounding
    newline characters are stripped. All other lines are kept as they are.

    >>> clean_lines(['!', '!', 'a', '!'], flag='!')
    ['!', 'a', '!']
    """
    res = []
    in_block = False
    for line in lines:
        if line.strip('\n') == flag:
            # Keep only the first line of a run of flag lines.
            if not in_block:
                res.append(line)
                in_block = True
            continue
        in_block = False
        res.append(line)
    return res


def main():
    """Print the sample configuration at each stage of the clean-up."""
    # debug
    for line in source_content.split('\n'):
        print(repr(process_line(line).strip()))

    whitened = '\n'.join(process_line(line).strip()
                         for line in source_content.split('\n'))

    print('^^^^^^^^^^^^^^')
    # Collapse runs of '!' lines first, then runs of empty lines.
    no_exc = '\n'.join(clean_lines(whitened.split('\n'), flag='!'))
    print(no_exc)
    print('##############')
    no_sp = '\n'.join(clean_lines(no_exc.split('\n')))
    print(no_sp)


# Keep the demo out of the way of ``python -m doctest file.py``.
if __name__ == '__main__':
    main()
If You Enjoyed This, Take 5 Seconds To Share It

0 comments:

Post a Comment