almost working base feature set

26a1e452 · Timothe Jost · 012b0095 · 26a1e452 · 26a1e452 · 26a1e452
Commit 26a1e452 authored 1 year ago by Timothe Jost
--- a/.coverage
+++ b/.coverage
--- a/pypelines.egg-info/PKG-INFO
+++ b/pypelines.egg-info/PKG-INFO
+Metadata-Version: 2.1
+Name: pypelines
+Version: 0.0.1
+Summary: Framework to organize processing code outputs to/from disk, processing chaining and versionning with a common easy to use api
+Home-page: https://gitlab.pasteur.fr/haisslab/data-management/pypelines
+Author: Timothé Jost-MOUSSEAU
+Author-email: timothe.jost-mousseau@pasteur.com
+License: MIT
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
--- a/pypelines.egg-info/SOURCES.txt
+++ b/pypelines.egg-info/SOURCES.txt
+README.md
+setup.py
+pypelines/__init__.py
+pypelines/disk.py
+pypelines/examples.py
+pypelines/loggs.py
+pypelines/multisession.py
+pypelines/pickle_backend.py
+pypelines/pipe.py
+pypelines/pipeline.py
+pypelines/sessions.py
+pypelines/step.py
+pypelines/versions.py
+pypelines.egg-info/PKG-INFO
+pypelines.egg-info/SOURCES.txt
+pypelines.egg-info/dependency_links.txt
+pypelines.egg-info/top_level.txt
+tests/__init__.py
+tests/tests.py
\ No newline at end of file
--- a/pypelines.egg-info/dependency_links.txt
+++ b/pypelines.egg-info/dependency_links.txt
+
--- a/pypelines.egg-info/top_level.txt
+++ b/pypelines.egg-info/top_level.txt
+pypelines
+tests
--- a/pypelines/__init__.py
+++ b/pypelines/__init__.py
@@ -3,4 +3,5 @@ __version__ = "0.0.1"
 from .pipe import *
 from .pipeline import *
 from .step import *
+from .disk import *
 from .versions import *
\ No newline at end of file
--- a/pypelines/__pycache__/__init__.cpython-311.pyc
+++ b/pypelines/__pycache__/__init__.cpython-311.pyc
--- a/pypelines/__pycache__/__init__.cpython-39.pyc
+++ b/pypelines/__pycache__/__init__.cpython-39.pyc
--- a/pypelines/__pycache__/disk.cpython-311.pyc
+++ b/pypelines/__pycache__/disk.cpython-311.pyc
--- a/pypelines/__pycache__/examples.cpython-311.pyc
+++ b/pypelines/__pycache__/examples.cpython-311.pyc
--- a/pypelines/__pycache__/loggs.cpython-39.pyc
+++ b/pypelines/__pycache__/loggs.cpython-39.pyc
--- a/pypelines/__pycache__/multisession.cpython-39.pyc
+++ b/pypelines/__pycache__/multisession.cpython-39.pyc
--- a/pypelines/__pycache__/pipe.cpython-311.pyc
+++ b/pypelines/__pycache__/pipe.cpython-311.pyc
--- a/pypelines/__pycache__/pipe.cpython-39.pyc
+++ b/pypelines/__pycache__/pipe.cpython-39.pyc
--- a/pypelines/__pycache__/pipeline.cpython-311.pyc
+++ b/pypelines/__pycache__/pipeline.cpython-311.pyc
--- a/pypelines/__pycache__/sessions.cpython-39.pyc
+++ b/pypelines/__pycache__/sessions.cpython-39.pyc
--- a/pypelines/__pycache__/step.cpython-311.pyc
+++ b/pypelines/__pycache__/step.cpython-311.pyc
--- a/pypelines/__pycache__/step.cpython-39.pyc
+++ b/pypelines/__pycache__/step.cpython-39.pyc
--- a/pypelines/disk.py
+++ b/pypelines/disk.py
-import os
+import os, re
 from . sessions import Session
+import pickle

 from typing import Callable, Type, Iterable, Protocol, TYPE_CHECKING

@@ -16,9 +17,8 @@ class BaseDiskObject :
    disk_version = None
    disk_step = None

-    def __init__(self, session : Session, step : BaseStep, extra = "") -> None :
+    def __init__(self, session : Session, step : "BaseStep", extra = "") -> None :

-        self.step = None
        self.session = session
        self.step = step
        self.extra = extra
@@ -29,7 +29,7 @@ class BaseDiskObject :
        """sets self.disk_version and self.disk_step"""
        ...

-    def save(self, object):
+    def save(self, data : OutputData) -> None:
        ...

    def load(self) -> OutputData:
@@ -42,7 +42,6 @@ class BaseDiskObject :
    def version_exist(self, session : Session):
        """returns True if the file found had a stamp for that step corresponding to the current version. False otherwise""" 
        return self.step.version == self.disk_version
-    

 class PickleObject(BaseDiskObject) :

@@ -50,35 +49,88 @@ class PickleObject(BaseDiskObject) :
    file_prefix = "preproc_data"
    extension = "pickle"
    current_suffixes = ""
+    remove = True
+    current_disk_file = None

-    def make_file_prefix_path(self):
-        prefix_path = self.file_prefix + "." + self.step.pipe_name
-        rigid_pattern = self.file_prefix
+    def parse_extra(self,extra):
+        """Turn an extra suffix into a regex fragment that matches it literally after a dot.
+
+        Leading and trailing dots are dropped and the remaining dots are backslash-escaped.
+        Returns an empty string when nothing is left once the dots are stripped.
+        """
+        trimmed = extra.strip(".")
+        if not trimmed :
+            return ""
+        escaped = trimmed.replace(".", r"\.")
+        return r"\." + escaped

-        pattern = ""
+    def make_file_name_pattern(self):
+        """Build the regex source used to recognise files written for any step of this step's pipe.
+
+        The pattern is the concatenation of : file_prefix, a dot, the pipe name, an optional
+        ".<step_name>" part (itself optionally followed by ".<version>"), the regex form of
+        self.extra, a dot and the extension. The optional parts are captured in the named
+        groups "step_name" and "version".
+
+        Returns:
+            str: the (uncompiled) regular expression.
+        """

-        if self.step.pipe.single_step :
-            pass
+        steps_patterns = []

-        if self.step.use_version :
-            pass
+        # one alternative per step registered in the pipe, in sorted key order
+        for key in sorted(self.step.pipe.steps.keys()):

+            step = self.step.pipe.steps[key]
+            # NOTE(review): step names are inserted unescaped - presumably they never contain regex metacharacters ; confirm
+            steps_patterns.append( fr"(?:{step.step_name})" )

-        flexible_pattern = self.f
+        steps_patterns = "|".join(steps_patterns)

-    def check_disk(self):
-        search_path = os.path.join(self.session.path, self.collection)
+        # the version is whatever sits between two dots ; the whole ".<step_name>[.<version>]" part is optional
+        version_pattern = fr"(?:\.(?P<version>[^\.]*))?"
+        step_pattern = fr"(?:\.(?P<step_name>{steps_patterns}){version_pattern})?"
        
+        extra = self.parse_extra(self.extra)
+                
+        # NOTE(review): file_prefix, pipe_name and extension are also inserted unescaped - confirm they are regex-safe
+        pattern = self.file_prefix + r"\." + self.step.pipe_name + step_pattern + extra + r"\." + self.extension
+        # NOTE(review): looks like a leftover debug print - confirm before removing
+        print(pattern)
+        return pattern
+    
+    def get_file_name(self):

-    def save(self, object):
-        ...
+        """Return the literal file name (without directory) under which this step's output is stored.
+
+        Layout : <file_prefix>.<pipe_name>.<step_name>[.<version>][.<extra>].<extension>
+        The version part is only present when the step has use_version set, and the extra part
+        only when self.extra is not empty once its leading / trailing dots are stripped.
+
+        Returns:
+            str: the file name.
+        """
+        # Build the extra suffix literally. parse_extra() returns a regex-escaped fragment (dots become "\.")
+        # meant for make_file_name_pattern() ; using it here would write backslashes into the name on disk
+        # and the saved file would then no longer be matched by the search pattern.
+        extra = self.extra.strip(".")
+        extra = "." + extra if extra else ""
+        version_string = "." + self.step.version if self.step.use_version else ""
+        filename = self.file_prefix + "." + self.step.pipe_name + "." + self.step.step_name + version_string + extra + "." + self.extension
+        return filename

-    def load(self) -> OutputData:
-        ...
+    def check_disk(self):
+        """Search the session's collection folder for a file written by this step's pipe.
+
+        On success, self.current_disk_file is set to the path of the selected file.
+
+        Returns:
+            bool: True when a file was selected, either because its step_name and version both equal the
+                expected ones, or because it is the only candidate (partial match, reported with a message).
+                False when no file matches or when several partial matches make the choice ambiguous.
+        """
+        search_path = os.path.join(self.session.path, os.path.sep.join(self.collection))
+        # build the pattern once : it serves both to list candidate files and to parse their names
+        name_pattern = self.make_file_name_pattern()
+        matching_files = files(search_path, re_pattern = name_pattern, relative = True, levels = 0)
+        if not matching_files :
+            return False
+
+        expected_values = {"step_name" : self.step.step_name, "version" : self.step.version if self.step.use_version else None}
+        pattern = re.compile(name_pattern)
+        candidates = []
+        match_datas = []
+        for file in matching_files :
+            match = pattern.search(file)
+            if match is None :
+                # files() tests the pattern against the full path, so an entry can be listed
+                # without its relative name matching by itself : ignore it instead of crashing
+                continue
+            # both group names are defined in make_file_name_pattern() ; a group that took no part in the match is None
+            match_data = {key : match.group(key) for key in expected_values}
+            if match_data == expected_values :
+                self.current_disk_file = os.path.join(search_path, file)
+                return True
+            candidates.append(file)
+            match_datas.append(match_data)
+
+        # no exact match : accept a lone partial match, refuse to choose between several
+        if len(match_datas) == 1 :
+            print(f"A single partial match was found. Please make sure it is consistant with expected behaviour. Expected : {expected_values} , Found : {match_datas[0]}")
+            self.current_disk_file = os.path.join(search_path, candidates[0])
+            return True
+        if match_datas :
+            print(f"More than one partial match were found. Cannot auto select. Expected : {expected_values} , Found : {match_datas}")
+        return False
+    
+    def get_full_path(self):
+        """Return the path of this object's file : session path, then the collection folders, then the file name."""
+        collection_dir = os.path.sep.join(self.collection)
+        return os.path.join(self.session.path, collection_dir, self.get_file_name())
+
+    def save(self, data : OutputData):
+        """Pickle data to the path given by get_full_path() and make that file the current disk file.
+
+        When self.remove is set and a different file was previously selected as the current disk file,
+        that older file is deleted once the new one has been written.
+        """
+        target_path = self.get_full_path()
+        with open(target_path, "wb") as handle :
+            pickle.dump(data, handle)
+        stale_file = self.current_disk_file
+        must_cleanup = stale_file is not None and stale_file != target_path and self.remove
+        if must_cleanup :
+            os.remove(stale_file)
+        self.current_disk_file = target_path

+    def load(self) -> OutputData:
+        """Unpickle and return the content of the file selected as self.current_disk_file.
+
+        Raises:
+            IOError: if no file has been selected (current_disk_file is None).
+        """
+        source_file = self.current_disk_file
+        if source_file is None :
+            raise IOError("Could not find a file to load. Either no file was found on disk, or you forgot to run 'check_disk()'")
+        # NOTE(review): unpickling can execute arbitrary code - only load files from a trusted location
+        with open(source_file, "rb") as handle :
+            return pickle.load(handle)

 import natsort
-from . import extract

 def files(input_path, re_pattern = None, relative = False,levels = -1, get = "files", parts = "all", sort = True):
    """
@@ -103,11 +155,11 @@ def files(input_path, re_pattern = None, relative = False,levels = -1, get = "fi
        for subdir in os.listdir(_input_path):
            fullpath = os.path.join(_input_path,subdir)
            if os.path.isfile(fullpath): 
-                if (get == "all" or get == "files") and (re_pattern is None or extract.qregexp(re_pattern,fullpath)):
+                if (get == "all" or get == "files") and (re_pattern is None or qregexp(re_pattern,fullpath)):
                    output_list.append(os.path.normpath(fullpath))
                    
            else :
-                if (get == "all" or get == "dirs" or get == "folders") and (re_pattern is None or extract.qregexp(re_pattern,fullpath)):
+                if (get == "all" or get == "dirs" or get == "folders") and (re_pattern is None or qregexp(re_pattern,fullpath)):
                    output_list.append(os.path.normpath(fullpath))
                if current_level < levels:
                    current_level += 1 
@@ -129,4 +181,62 @@ def files(input_path, re_pattern = None, relative = False,levels = -1, get = "fi
        

    
+def qregexp(regex, input_line, groupidx=None, matchid=None , case=False):
+    """
+    Simplified helper for matching regular expressions with python's built-in module re.
+
+    The pattern is searched anywhere in the source (re.finditer) with the re.MULTILINE flag.
+
+    Tip:
+        Design your patterns easily at [Regex101](https://regex101.com/)
+
+    Args:
+        regex (str): Regex pattern to match on the source.
+        input_line (str): Source on which the pattern will be searched.
+        groupidx (int, optional): 0-based index into the capturing groups of the selected match.
+            Defaults to None (the whole matched text is returned).
+        matchid (int, optional): 0-based index of the match to use when there are several.
+            Defaults to None (first match is used).
+        case (bool, optional): when True, re.IGNORECASE is added, i.e. matching ignores case.
+            Defaults to False (case sensitive matching).
+
+    Returns:
+        bool or str: False when there is no match, when matchid is past the last match or when groupidx is
+        past the last group. Otherwise the matched text, or the content of the requested group
+        (which is None if that group did not take part in the match).
+
+    Warning:
+        This function returns only one group/match.
+        A successful match of empty text returns "" which is falsy, like the False returned on failure.
+
+    """
+
+    if case :
+        matches = re.finditer(regex, input_line, re.MULTILINE|re.IGNORECASE)
+    else :
+        matches = re.finditer(regex, input_line, re.MULTILINE)
+
+    # shift the 0-based matchid to the 1-based numbering used by enumerate below
+    if matchid is not None :
+        matchid = matchid +1
+
+    for matchnum, match in enumerate(matches,  start = 1):
+
+        if matchid is not None :
+            # skip matches until the requested one is reached
+            if matchnum == matchid :
+                if groupidx is not None :
+                    for groupx, groupcontent in enumerate(match.groups()):
+                        if groupx == groupidx :
+                            return groupcontent
+                    return False
+
+                else :
+                    MATCH = match.group()
+                    return MATCH
+
+        else :
+            # no matchid given : the first match decides the result
+            if groupidx is not None :
+                for groupx, groupcontent in enumerate(match.groups()):
+                    if groupx == groupidx :
+                        return groupcontent
+                return False
+            else :
+                MATCH = match.group()
+                return MATCH
+    return False
        
\ No newline at end of file
--- a/pypelines/examples.py
+++ b/pypelines/examples.py
@@ -6,7 +6,7 @@ from .step import stepmethod
 class ExamplePipeline(BasePipeline):
    ...

-example_pipeline = ExamplePipeline()
+example_pipeline = ExamplePipeline("example")

 @example_pipeline.register_pipe
 class ExamplePipe(PicklePipe):