From cec96a3debb334f991c3e5b41fea5b2a44c1bde7 Mon Sep 17 00:00:00 2001
From: Cosmin  SAVEANU <cosmin.saveanu@pasteur.fr>
Date: Wed, 14 Oct 2020 13:59:54 +0200
Subject: [PATCH] Update Get rows py3/Gt_rws_with_list.py

---
 Get rows py3/Gt_rws_with_list.py | 279 +++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)
 create mode 100644 Get rows py3/Gt_rws_with_list.py

diff --git a/Get rows py3/Gt_rws_with_list.py b/Get rows py3/Gt_rws_with_list.py
new file mode 100644
index 0000000..f62e667
--- /dev/null
+++ b/Get rows py3/Gt_rws_with_list.py	
@@ -0,0 +1,279 @@
+"""
+This script allows the user to obtain
+rows from a tab delimited file in which
+one of the elements in a column matches
+exactly one element from a list of elements
+(get the description of every ORF from
+a list of ORFs for example).
+A line by line filter version should be
+used for very large files; beware.
+
+Usage:
+
+--tabfile, -t = description file (tab delimited large file)
+--tobematched, -i = file with elements to be matched against
+		description file items
+--help, -h = this help message
+
+"""
+
+import  os, sys, getopt
+
+class TabData:
+	"""In-memory table read from a tab-delimited file.
+
+	``data`` is a list of rows (each a list of strings) and ``nmblines``
+	mirrors ``len(data)``.  Row 0 is treated as the header by the
+	column ``*_by_name`` helpers; column 0 holds the row names.
+	"""
+
+	def __init__(self):
+		self.nmblines = 0
+		self.data = []
+
+	def read_from_file(self, filename):
+		"""Append the rows of the tab-delimited file ``filename`` to ``data``."""
+		self.filename = filename
+		# 'with' closes the file even if reading fails part-way
+		with open(filename, 'r') as f:
+			for line in f:
+				self.data.append(line.strip('\n\r').split('\t'))
+		self.nmblines = len(self.data)
+
+	def first_line(self):
+		"""Return the first row as one string, each item followed by 4 spaces."""
+		return ''.join(item + '    ' for item in self.data[0])
+
+	def number_of_columns(self):
+		"""Return the number of items in the first row."""
+		return len(self.data[0])
+
+	def get_row(self, number):
+		"""Return the row at index ``number``."""
+		return self.data[number]
+
+	def get_row_by_name(self, row_name):
+		"""Return (copy of the first row whose column 0 equals row_name, 1),
+		or ([], 0) when no row matches."""
+		r_name = str(row_name)
+		for row in self.data:
+			if r_name == row[0]:
+				# list(row) copies, so callers cannot mutate the table through it
+				return list(row), 1
+		return [], 0
+
+	def get_row_by_colname(self, name, colnmb):
+		"""
+		arguments : name - name to be matched, colnmb - position
+		returns a tuple containing the row and 1 if OK or 0 if problem
+
+		``name`` is upper-cased before comparison; the table cells are
+		compared as stored.  Rows too short to have ``colnmb`` are skipped.
+		"""
+		search_name = name.upper()
+		for row in self.data:
+			if colnmb < len(row) and search_name == row[colnmb]:
+				return row, 1
+		return [], 0
+
+	def get_row_withindex(self, name, column):
+		"""
+		arguments : name - name to be matched, column - obtained by get_column
+		returns a tuple containing the row and 1 if OK or 0 if problem
+
+		The match is exact (case-sensitive) and the first hit wins.
+		"""
+		try:
+			return self.data[column.index(name)], 1
+		except ValueError:
+			return [], 0
+
+	def get_column(self, number):
+		"""Return the items at position ``number`` of every row.
+
+		Raises IndexError when a row is too short to have that position.
+		"""
+		nmb = int(number)
+		return [row[nmb] for row in self.data]
+
+	def get_column_by_name(self, col_name):
+		"""Return (column, 1) for the first header cell equal to col_name.
+
+		When no header cell matches, column 0 is returned with control 0.
+		"""
+		c_name = str(col_name)
+		header = self.data[0]
+		if c_name in header:
+			return self.get_column(header.index(c_name)), 1
+		return self.get_column(0), 0
+
+	def del_row(self, number):
+		"""Delete the row at index ``number`` and keep ``nmblines`` in sync."""
+		del self.data[number]
+		self.nmblines = len(self.data)
+
+	def del_column(self, number):
+		"""delete column by number"""
+		# iterate the rows themselves: a stale line count cannot overrun
+		for row in self.data:
+			del row[number]
+
+	def del_column_by_name(self, column_name):
+		"""delete column by name
+
+		accepts a string as the column name
+		if no such column found, returns the original matrix
+		(when several header cells match, the last one is deleted)
+		"""
+		col_name = str(column_name)
+		# None, not 0, marks "not found" so that column 0 can be deleted
+		number = None
+		for i, cell in enumerate(self.data[0]):
+			if col_name == cell:
+				number = i
+		if number is None:
+			return self.data
+		return self.del_column(number)
+
+	def del_row_by_name(self, row_name):
+		"""delete row by name
+
+		accepts a string as the row name
+		if no such row found, returns the original matrix
+		(when several rows match, the last one is deleted)
+		"""
+		r_name = str(row_name)
+		# None, not 0, marks "not found" so that row 0 can be deleted
+		number = None
+		for i, row in enumerate(self.data):
+			if r_name == row[0]:
+				number = i
+		if number is None:
+			return self.data
+		return self.del_row(number)
+
+	def numb_of_lines(self):
+		"""Return the current number of rows."""
+		return len(self.data)
+
+	def __str__(self):
+		"""Return the table as tab-delimited text, one row per line."""
+		# __str__ must return a str; returning the list raised TypeError
+		return '\n'.join('\t'.join(map(str, row)) for row in self.data)
+
+	def extract_column(self, column_name):
+		"""returns a list corresponding to column"""
+		header = self.data[0] if self.data else []
+		# positions of every header cell carrying that name
+		wanted = [j for j, cell in enumerate(header) if cell == column_name]
+		result_list = []
+		for row in self.data:
+			# rows shorter than the header simply contribute nothing
+			result_list.extend(row[j] for j in wanted if j < len(row))
+		return result_list
+
+	def transpose(self):
+		"""transpose matrix (row 0 fixes the number of columns)"""
+		width = len(self.data[0])
+		return [[row[i] for row in self.data] for i in range(width)]
+
+	def transpose_z(self):
+		"""transpose matrix using zip (truncates to the shortest row)"""
+		# zip over the rows; the TabData object itself is not iterable
+		return list(zip(*self.data))
+#_________________________________
+#output helper, usage error and main
+
+def write_matrix(matrix, file_handle):
+	"""Write matrix as text, one row per line; every item (even the
+	last of a row) is followed by a tab."""
+	for row in matrix:
+		file_handle.writelines(cell + '\t' for cell in row)
+		file_handle.write('\n')
+#__________________________________
+
+class Usage(Exception):
+    """Command-line usage error; ``msg`` holds the text shown to the user."""
+    def __init__(self, msg): self.msg = msg
+
+def main(argv=None):
+	"""Command-line entry point: write the rows matching a list of items.
+
+	Parses -t/--tabfile and -i/--tobematched from ``argv`` (default
+	``sys.argv``), asks on stdin for the column to match, and writes one
+	row per listed item to ``matchedrows.out`` next to the item file.
+	Returns 0 on success, 2 after reporting a usage error on stderr.
+	"""
+	global tabfile_fname, tobematched_fname
+	tabfile_fname, tobematched_fname = "", ""
+	if argv is None:
+		argv = sys.argv
+	try:
+		try:
+			opts, args = getopt.getopt(argv[1:], "ht:i:", ["help", "tabfile=", "tobematched="])
+		except getopt.error as msg:
+			raise Usage(msg)
+		# option processing
+		for option, value in opts:
+			if option in ("-h", "--help"):
+				raise Usage(__doc__)
+			if option in ("-t", "--tabfile"):
+				tabfile_fname = value
+			if option in ("-i", "--tobematched"):
+				tobematched_fname = value
+		if tabfile_fname == "" or tobematched_fname == "":
+			raise Usage("no input filenames!")
+
+		# read data from tab file in TabData object
+		file_info = os.stat(tabfile_fname)
+		print("Reading ", tabfile_fname, ' of ' , (file_info.st_size/1024), ' kBytes  .......... \n')
+		print("The results will be written to a file called matchedrows.out")
+		properties = TabData()
+		properties.read_from_file(tabfile_fname)
+		if properties.numb_of_lines() == 0:
+			raise Usage("description file is empty!")
+
+		# read data from list of items in a list
+		with open(tobematched_fname, 'r') as item_file:
+			item_list = [line.strip('\r\n') for line in item_file]
+
+		nmb_columns = properties.number_of_columns()
+		print('Your description file has', nmb_columns, 'columns')
+		print('The first line of the file says:')
+		print(properties.first_line())
+		print('Enter the number of the column used for match, (1 to '+str(nmb_columns)+'):\n')
+		try:
+			match_colnumber = int(input())
+		except ValueError:
+			raise Usage("column number must be an integer!")
+		# 0 or a negative number would silently index from the end
+		if not 1 <= match_colnumber <= nmb_columns:
+			raise Usage("column number out of range!")
+
+		# output file path: same directory as the list of items
+		output_filename = os.path.join(os.path.dirname(tobematched_fname), "matchedrows.out")
+
+		result_data = []
+		indexed_column = properties.get_column(match_colnumber - 1)
+		for visual_counter, item in enumerate(item_list, 1):
+			row, found = properties.get_row_withindex(item, indexed_column)
+			if found == 1:
+				result_data.append(row)
+			else:
+				result_data.append([item, ""])
+				print("Item", item, "not found")
+			print(visual_counter, item)
+		with open(output_filename, "w") as out_file:
+			write_matrix(result_data, out_file)
+		return 0
+	except Usage as err:
+		print(os.path.basename(sys.argv[0]) + ": " + str(err.msg), file=sys.stderr)
+		print("\t for help use --help", file=sys.stderr)
+		return 2
+	
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
+
+
+
+
-- 
GitLab