From af780224bb9b21467cdb2e260802a9094f11f615 Mon Sep 17 00:00:00 2001
From: Bryan Brancotte <bryan.brancotte@pasteur.fr>
Date: Wed, 19 Feb 2025 16:29:27 +0100
Subject: [PATCH 1/2] cleanup

---
 src/strass/test_data/README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/strass/test_data/README.md b/src/strass/test_data/README.md
index 5204054d..5a24bdc4 100644
--- a/src/strass/test_data/README.md
+++ b/src/strass/test_data/README.md
@@ -10,6 +10,4 @@ writer = PdfWriter(clone_from="cv.pdf")
 writer.add_js('app.alert("ICanSubmitTheContentOfThisFileAnywhere.");')
 with open("cv-with-js.pdf", "wb") as fp:
     writer.write(fp)
-
-writer = PdfWriter(clone_from="cv-with-js.pdf")
 ```
\ No newline at end of file
-- 
GitLab


From 0883b3e4a48ea580e28cd984d1d98d7784c62ad1 Mon Sep 17 00:00:00 2001
From: Bryan Brancotte <bryan.brancotte@pasteur.fr>
Date: Wed, 19 Feb 2025 16:31:58 +0100
Subject: [PATCH 2/2] convert to PostScript to mitigate attack with pdf, test
 independently each method

---
 src/strass/Dockerfile                         |  1 +
 .../strass_app/tests/test_sanitize_pdf.py     | 14 ++++++++
 src/strass/strass_app/utils.py                | 36 ++++++++++++++++++-
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/strass/Dockerfile b/src/strass/Dockerfile
index 82313de8..435156d6 100644
--- a/src/strass/Dockerfile
+++ b/src/strass/Dockerfile
@@ -20,6 +20,7 @@ RUN addgroup --gid 1000 kiwi \
         gcc \
         g++ \
         libmagic1 \
+        ghostscript \
  && rm -rf /var/lib/apt/lists/* \
  && python -m pip install --upgrade pip \
  && pip install csscompressor gunicorn
diff --git a/src/strass/strass_app/tests/test_sanitize_pdf.py b/src/strass/strass_app/tests/test_sanitize_pdf.py
index fab61988..95360005 100644
--- a/src/strass/strass_app/tests/test_sanitize_pdf.py
+++ b/src/strass/strass_app/tests/test_sanitize_pdf.py
@@ -21,3 +21,17 @@ class SafePDFTestCase(TooledTestCase):
         cv.seek(0)
         my_io = utils.safe_pdf(cv)
         self.check_no_js(my_io)
+
+    def test_pdf_file_cleanup_with_js(self):
+        with open(os.path.join(self.test_data, "cv-with-js.pdf"), "rb") as cv:  # closed on exit
+            self.assertIn('ICanSubmitTheContentOfThisFileAnywhere', str(cv.read()))
+            cv.seek(0)
+            my_io = utils._pdf_file_cleanup(cv)
+            self.check_no_js(my_io)
+
+    def test_pdf_ps_pdf_with_js(self):
+        with open(os.path.join(self.test_data, "cv-with-js.pdf"), "rb") as cv:  # closed on exit
+            self.assertIn('ICanSubmitTheContentOfThisFileAnywhere', str(cv.read()))
+            cv.seek(0)
+            my_io = utils._pdf_ps_pdf(cv)
+            self.check_no_js(my_io)
diff --git a/src/strass/strass_app/utils.py b/src/strass/strass_app/utils.py
index 7268d2c6..5c9a0079 100644
--- a/src/strass/strass_app/utils.py
+++ b/src/strass/strass_app/utils.py
@@ -1,5 +1,6 @@
 import functools
 import logging
+import subprocess
 from abc import abstractmethod
 from io import BytesIO
 from typing import IO, Any
@@ -283,7 +284,10 @@ def _pdf_object_cleanup(pdf_file, obj):
             del obj[k]
 
 
-def safe_pdf(my_stream: IO[Any]):
+def _pdf_file_cleanup(my_stream: IO[Any]):
+    """
+    Strip out Javascript code from the pdf file
+    """
     writer = PdfWriter(clone_from=my_stream)
     _pdf_object_cleanup(writer, writer.root_object.get('/Names', {}))
 
@@ -295,6 +299,36 @@ def safe_pdf(my_stream: IO[Any]):
     return myio
 
 
+def _pdf_ps_pdf(my_stream: IO[Any]):
+    """
+    Convert the file to PostScript and back to PDF to purge code and dynamic content.
+
+    Raises subprocess.CalledProcessError when either conversion exits non-zero, so a
+    failed run is not silently returned as if it were a sanitized document.
+    """
+    # run() feeds stdin, drains stdout and reaps the process in a single call.
+    cv_ps = subprocess.run(
+        ('pdf2ps', '-dLanguageLevel=1', '-', '-'),
+        input=my_stream.read(),
+        stdout=subprocess.PIPE,
+        check=True,
+    ).stdout
+
+    cv_safe = subprocess.run(
+        ('ps2pdf', '-', '-'),
+        input=cv_ps,
+        stdout=subprocess.PIPE,
+        check=True,
+    ).stdout
+    return BytesIO(cv_safe)
+
+
+def safe_pdf(my_stream: IO[Any]):
+    """Sanitize a PDF: strip Javascript, then round-trip it through PostScript."""
+    stripped = _pdf_file_cleanup(my_stream)
+    return _pdf_ps_pdf(stripped)
+
+
 def use_markdown_or_plain_text_message() -> str:
     if live_settings.markdown_enabled__bool:
         return _('You can use markdown here.')
-- 
GitLab