# Provenance: reconstructed from git patch
# cec96a3debb334f991c3e5b41fea5b2a44c1bde7
# From: Cosmin SAVEANU <cosmin.saveanu@pasteur.fr>
# Date: Wed, 14 Oct 2020 13:59:54 +0200
# Subject: [PATCH] Update Get rows py3/Gt_rws_with_list.py
# (new file "Get rows py3/Gt_rws_with_list.py", 279 insertions)
"""
This script allows the user to obtain
rows from a tab delimited file in which
one of the elements in a column matches
exactly one element from a list of elements
(get the description of every ORF from
a list of ORFs for example).
A line by line filter version should be
used for very large files; beware.

Usage:

--tabfile, -t = description file (tab delimited large file)
--tobematched, -i = file with elements to be matched against
                    description file items
--help, -h = this help message

"""

import getopt
import os
import sys


class TabData:
    """In-memory table loaded from a tab separated text file.

    Attributes:
        data: list of rows, each row being a list of cell strings.
        nmblines: number of rows in ``data``; refreshed by
            ``read_from_file`` and ``del_row``.
        filename: path passed to the last ``read_from_file`` call
            (only set once that method has been called).
    """

    def __init__(self):
        self.nmblines = 0
        self.data = []

    def read_from_file(self, filename):
        """Append every line of the tab delimited file *filename* to ``data``.

        Line terminators are stripped and each line is split on tabs, so a
        blank line becomes the one-cell row ``['']``.
        """
        self.filename = filename
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(filename, 'r') as handle:
            for line in handle:
                self.data.append(line.strip('\n\r').split('\t'))
        self.nmblines = len(self.data)

    def first_line(self):
        """Return the cells of the first row, each followed by one space.

        Returns an empty string when the table has no rows.
        """
        if not self.data:
            return ""
        return "".join(item + ' ' for item in self.data[0])

    def number_of_columns(self):
        """Return the width of the first row (0 for an empty table)."""
        return len(self.data[0]) if self.data else 0

    def get_row(self, number):
        """Return the row at index *number* (raises IndexError if absent)."""
        return self.data[number]

    def get_row_by_name(self, row_name):
        """Find the first row whose first cell equals ``str(row_name)``.

        Returns a tuple ``(row_copy, 1)`` on success or ``([], 0)`` when no
        row matches.
        """
        r_name = str(row_name)
        for row in self.data:
            if row and row[0] == r_name:
                return list(row), 1
        return [], 0

    def get_row_by_colname(self, name, colnmb):
        """
        arguments : name - name to be matched, colnmb - position
        returns a tuple containing the row and 1 if OK or 0 if problem

        Only the query is upper-cased; the table cells are compared as
        stored, so the lookup succeeds only against upper-case cells.
        Rows too short to have column *colnmb* are skipped.
        """
        search_name = name.upper()
        for row in self.data:
            if colnmb < len(row) and row[colnmb] == search_name:
                return row, 1
        return [], 0

    def get_row_withindex(self, name, column):
        """
        arguments : name - name to be matched, column - obtained by get_column
        returns a tuple containing the row and 1 if OK or 0 if problem

        The position of the first occurrence of *name* in *column* is used
        as the row index into ``data``.
        """
        try:
            idx = column.index(name)
        except ValueError:
            return [], 0
        return self.data[idx], 1

    def get_column(self, number):
        """Return the cells at index ``int(number)`` of every row.

        Raises IndexError if any row is too short for that index.
        """
        nmb = int(number)
        return [row[nmb] for row in self.data]

    def get_column_by_name(self, col_name):
        """Return ``(column, 1)`` for the first header cell equal to *col_name*.

        When the name is not in the first row the first column is returned
        with control code 0; an empty table yields ``([], 0)``.
        """
        c_name = str(col_name)
        if not self.data:
            return [], 0
        try:
            c_nmb = self.data[0].index(c_name)
        except ValueError:
            return self.get_column(0), 0
        return self.get_column(c_nmb), 1

    def del_row(self, number):
        """Delete the row at index *number* and refresh ``nmblines``."""
        del self.data[number]
        # Keep the cached row count in step with the data.
        self.nmblines = len(self.data)

    def del_column(self, number):
        """delete column by number"""
        for row in self.data:
            del row[number]

    def del_column_by_name(self, column_name):
        """delete column by name

        accepts a string as the column name
        if no such column found, returns the original matrix

        When several header cells match, the last one is deleted. After a
        deletion the result of ``del_column`` (None) is returned.
        """
        col_name = str(column_name)
        if not self.data:
            return self.data
        matches = [idx for idx, title in enumerate(self.data[0])
                   if title == col_name]
        # An explicit "no match" test lets column 0 be deleted as well.
        if not matches:
            return self.data
        return self.del_column(matches[-1])

    def del_row_by_name(self, row_name):
        """delete row by name

        accepts a string as the row name
        if no such row found, returns the original matrix

        When several rows match, the last one is deleted. After a deletion
        the result of ``del_row`` (None) is returned.
        """
        r_name = str(row_name)
        matches = [idx for idx, row in enumerate(self.data)
                   if row and row[0] == r_name]
        # An explicit "no match" test lets row 0 be deleted as well.
        if not matches:
            return self.data
        return self.del_row(matches[-1])

    def numb_of_lines(self):
        """Return the current number of rows."""
        return len(self.data)

    def __str__(self):
        """Return the table as text: cells joined by tabs, rows by newlines."""
        return "\n".join("\t".join(map(str, row)) for row in self.data)

    def extract_column(self, column_name):
        """returns a list corresponding to column

        Every column whose header cell equals *column_name* contributes,
        row by row, starting with the header row itself. Rows too short to
        contain a matching column are skipped for that column.
        """
        if not self.data:
            return []
        # Resolve the header positions once instead of once per row.
        wanted = [idx for idx, title in enumerate(self.data[0])
                  if title == column_name]
        return [row[idx] for row in self.data for idx in wanted
                if idx < len(row)]

    def transpose(self):
        """transpose matrix

        Returns a list of columns (lists), one per cell of the first row;
        an empty table gives an empty list. Raises IndexError if a row is
        shorter than the first row.
        """
        if not self.data:
            return []
        return [[row[idx] for row in self.data]
                for idx in range(len(self.data[0]))]

    def transpose_z(self):
        """transpose matrix using zip

        Returns a list of tuples, truncated to the shortest row.
        """
        return list(zip(*self.data))


#_________________________________
#main

def write_matrix(matrix, file_handle):
    """Write *matrix* to *file_handle*, one row per line.

    Every cell (which must be a string) is followed by a tab, so each
    output line ends with a trailing tab before the newline.
    """
    for line in matrix:
        file_handle.write("".join(item + '\t' for item in line) + '\n')
#__________________________________


class Usage(Exception):
    """Command line usage error; ``msg`` is the text shown to the user."""

    def __init__(self, msg):
        super().__init__(msg)
        self.msg = msg


def main(argv=None):
    """Run the row matcher.

    Reads the table given by ``--tabfile`` and the items given by
    ``--tobematched``, asks on standard input which column to match on,
    and writes the first matching row for every item to
    ``matchedrows.out`` next to the items file. Items without a match are
    written as ``[item, ""]``.

    Returns 2 after reporting a usage error on stderr; otherwise None.
    """
    # Kept as module globals because the original script published them.
    global tabfile_fname, tobematched_fname
    tabfile_fname, tobematched_fname = "", ""
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, _args = getopt.getopt(
                argv[1:], "ht:i:", ["help", "tabfile=", "tobematched="])
        except getopt.error as msg:
            raise Usage(msg)
        # option processing
        for option, value in opts:
            if option in ("-h", "--help"):
                raise Usage(__doc__)
            if option in ("-t", "--tabfile"):
                tabfile_fname = value
            if option in ("-i", "--tobematched"):
                tobematched_fname = value

        if not tabfile_fname or not tobematched_fname:
            raise Usage("no input filenames!")

        # read data from tab file in TabData object
        file_info = os.stat(tabfile_fname)
        print("Reading ", tabfile_fname, ' of ', (file_info.st_size / 1024),
              ' kBytes .......... \n')
        print("The results will be written to a file called matchedrows.out")

        properties = TabData()
        properties.read_from_file(tabfile_fname)

        # read data from list of items in a list
        with open(tobematched_fname, 'r') as items_file:
            item_list = [line.strip('\r\n') for line in items_file]

        column_count = properties.number_of_columns()
        print('Your description file has', column_count, 'columns')
        print('The first line of the file says:')
        print(properties.first_line())
        print('Enter the number of the column used for match, (1 to '
              + str(column_count) + '):\n')
        answer = input()
        try:
            match_colnumber = int(answer)
        except ValueError:
            raise Usage("column number must be an integer, got "
                        + repr(answer)) from None
        # Reject 0 and negatives: they would silently select columns
        # counted from the end through negative indexing.
        if not 1 <= match_colnumber <= column_count:
            raise Usage("column number must be between 1 and "
                        + str(column_count))
        match_index = match_colnumber - 1

        # output file path
        output_filename = os.path.join(
            os.path.dirname(tobematched_fname), "matchedrows.out")

        # Index the first row seen for every key once, so each item is a
        # constant-time lookup; rows too short for the column are skipped.
        first_row_by_key = {}
        for row in properties.data:
            if match_index < len(row):
                first_row_by_key.setdefault(row[match_index], row)

        result_data = []
        for visual_counter, item in enumerate(item_list, start=1):
            if item in first_row_by_key:
                result_data.append(first_row_by_key[item])
            else:
                result_data.append([item, ""])
                print("Item", item, "not found")
            print(visual_counter, item)

        with open(output_filename, "w") as out_file:
            write_matrix(result_data, out_file)

    except Usage as err:
        print(os.path.basename(sys.argv[0]) + ": " + str(err.msg),
              file=sys.stderr)
        print("\t for help use --help", file=sys.stderr)
        return 2


if __name__ == "__main__":
    sys.exit(main())