Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
sequence-bioinformatics
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Nicolas MAILLET
sequence-bioinformatics
Commits
b4db52bf
Commit
b4db52bf
authored
1 year ago
by
Nicolas MAILLET
Browse files
Options
Downloads
Patches
Plain Diff
CM-TP regex
parent
4f70e54a
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
8-Regex/tp8.py
+202
-0
202 additions, 0 deletions
8-Regex/tp8.py
with
202 additions
and
0 deletions
8-Regex/tp8.py
0 → 100644
+
202
−
0
View file @
b4db52bf
"""
TP8 where we play with regex
"""
# Import regex
import
re
def read_file_old(file_in):
    """
    Read a two-lines-per-record file and return its content.

    Lines are consumed alternately: a line read while no header is pending
    becomes the pending header, and the following line is paired with it as
    the sequence. Both are stripped of surrounding whitespace.

    file_in: path of the file to read.
    Returns a list of (header, sequence) tuples, in file order.
    """
    records = []
    # Header waiting for its sequence; an empty string means "none pending"
    pending_header = ""
    with open(file_in) as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if pending_header:
                # A header is waiting: this line is its sequence
                records.append((pending_header, text))
                pending_header = ""
            else:
                # No header waiting: this line is the next header
                pending_header = text
    return records
def read_file(file_in):
    """
    Read a two-lines-per-record file and return its content.

    Each record is a header line immediately followed by a sequence line.
    Both are stripped of surrounding whitespace. Blank lines found where a
    header is expected (typically trailing empty lines at the end of the
    file) are skipped instead of producing an empty ("", "") record.

    file_in: path of the file to read.
    Returns a list of (header, sequence) tuples, in file order. A header
    that is the very last line of the file gets an empty sequence.
    """
    content = []
    # Open the file
    with open(file_in) as file:
        # Each iteration consumes a header line...
        for line in file:
            header = line.strip()
            # A blank line is not a header: ignore it
            if not header:
                continue
            # ...and the sequence is the next line of the same iterator
            # (empty string if the file ends right after the header)
            seq = next(file, "").strip()
            # Store the record as a tuple (not modifiable)
            content.append((header, seq))
    # Return the content of the file
    return content
def _any_sequence_matches(pattern, content):
    """Return True if `pattern` is found in at least one sequence of `content`."""
    regex = re.compile(pattern)
    # any() stops at the first hit, like the explicit loop + break
    return any(regex.search(seq) for _, seq in content)


def _print_presence(found):
    """Print the yes/no answer to a "is there a sequence such that..." question."""
    if found:
        print("\nThere is!")
    else:
        print("\nThere is not :(")


def main():
    """
    The main of TP8: run a series of regex queries on a sequence file.

    Reads "sequences.fasta" with read_file() (a list of (header, sequence)
    tuples) and prints the answer of each query on standard output.
    Takes no argument and returns nothing.
    """
    # The file to process
    file_in = "sequences.fasta"
    # Get its content
    content = read_file(file_in)

    # Count all occurrences of 'GTA', on the sequences only
    nb_gta = sum(len(re.findall("GTA", seq)) for _, seq in content)
    print("There is {} 'GTA' in the file".format(nb_gta))

    # Yes/no questions on the sequences, answered in this order:
    presence_patterns = (
        # Is there a sequence containing 'GTA(some characters)CT'?
        "GTA.*CT",
        # Is there a sequence containing 'GTA(max 3 characters)CTAAT'?
        "GTA.{0,3}CTAAT",
        # Is there a sequence containing 'GG T or C GG'?
        "GG[TC]GG",
        # Is there a sequence finishing by 'ATATAT'?
        "ATATAT$",
        # Is there a sequence starting or finishing by 'ATATAT'?
        "^ATATAT|ATATAT$",
    )
    for pattern in presence_patterns:
        _print_presence(_any_sequence_matches(pattern, content))

    # Get headers containing mmus or musm (regex on the header only)
    mus_headers = [header for header, _ in content
                   if re.search("mmus|musm", header)]
    print("\nMus Musculus headers: {}".format(mus_headers))

    # Count headers containing / or \ (regex on the header only).
    # Raw string: \\ is a literal backslash for the regex engine, and the
    # r prefix avoids escaping it a second time for Python.
    nb_slash = sum(1 for header, _ in content
                   if re.search(r"\\|\/", header))
    print("\nThere is {} headers containing (back)slash".format(nb_slash))

    # Find the sequences containing not only DNA: anything, then one
    # character that is not A, C, G or T, then anything
    buggy = []
    for _, seq in content:
        buggy += re.findall("^.*[^ACGT].*$", seq)
    print("\nBuggy sequence: {}".format(buggy))

    # Get the part of headers containing an id composed of 3 letters,
    # 1 digit, 1 alphanumeric character, 1 character and surrounded by spaces
    special = []
    for header, _ in content:
        special += re.findall(r"\ [A-Za-z]{3}\d{1}\w{1}.{1}\ ", header)
    print("\nSpecial header sequences: {}".format(special))

    # Get the sequence where the header contains an email address.
    email_regex = re.compile(
        r"[^\W][a-zA-Z0-9_]+(\.[a-zA-Z0-9_]+)*\@[a-zA-Z0-9_]+(\.[a-zA-Z0-9_]+)*\.[a-zA-Z]{2,4}"
    )
    email_seq = ""
    for header, seq in content:
        # No break: if several headers match, the last one is reported
        if email_regex.search(header):
            email_seq = seq
    print("\nSequence with email on the header: {}".format(email_seq))
# Launch the main only when this file is executed as a script, so that
# importing the module does not run the exercise nor stop the interpreter
if __name__ == "__main__":
    main()
    # Exit without error (the `exit` builtin is meant for interactive use)
    raise SystemExit(0)
# Always put one extra return line
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment