RWTH-TIME · PaulKalho · Jan 17, 2026 · Dec 9, 2025 · Dec 10, 2025 · Dec 10, 2025
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -65,18 +65,6 @@ jobs:
     runs-on: ubuntu-latest
     needs: validate-compute-block
     services:
-      minio:
-        image: lazybit/minio
-        ports:
-          - 9000:9000
-        env:
-          MINIO_ROOT_USER: minioadmin
-          MINIO_ROOT_PASSWORD: minioadmin
-        options: >-
-          --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
-          --health-interval 5s
-          --health-retries 5
-          --health-timeout 5s
       postgres:
         image: postgres:15
         ports:

diff --git a/algorithms/lda.py b/algorithms/lda.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pandas as pd
 
+from typing import List
+
 from sklearn.decomposition import LatentDirichletAllocation
 
 logger = logging.getLogger(__name__)
@@ -13,6 +15,7 @@ def __init__(
         self,
         dtm: np.ndarray = None,
         vocab: dict = None,
+        doc_ids: List[str] = [],
         n_topics: int = 10,
         max_iter: int = 10,
         learning_method: str = "batch",
@@ -21,6 +24,12 @@ def __init__(
     ):
         self.dtm: np.ndarray = dtm
         self.vocab: dict = vocab
+        self.doc_ids = doc_ids
+
+        if len(self.doc_ids) != self.dtm.shape[0]:
+            raise ValueError(
+                "doc_ids length must match number of DTM rows"
+            )
 
         self.n_topics = n_topics
         self.max_iter = max_iter
@@ -58,6 +67,7 @@ def extract_doc_topics(self) -> pd.DataFrame:
             self.doc_topic_dist,
             columns=[f"topic_{i}" for i in range(self.n_topics)],
         )
+        df.insert(0, "doc_id", self.doc_ids)
 
         logger.debug(
             f"Extracted doc-topic distribution DataFrame shape={df.shape}")
@@ -68,17 +78,28 @@ def extract_topic_terms(self):
         Generate topic and top-terms DataFrame
         """
         logger.info("Extracting top terms per topic...")
-        idx2term = {idx: term for term, idx in self.vocab.items()}
+
+        # NOTE:
+        # The order of `terms` is guaranteed to match the DTM column order.
+        # This is because the vocabulary is built in NLPVectorizer using:
+        #   sorted_terms = sorted(all_terms)
+        #   vocab = {term: i for i, term in enumerate(sorted_terms)}
+        # The same vocab indices are then used to construct the DTM columns.
+        # Since Python dicts preserve insertion order (>=3.7),
+        # list(self.vocab.keys())[i] correctly maps to DTM column i,
+        # and thus to lda.components_[topic_idx][i].
+        terms = list(self.vocab.keys())
         topic_rows = []
 
         for topic_idx, topic in enumerate(self.lda.components_):
-            sorted = np.argsort(topic)[::-1]
-            top_indices = sorted[:self.n_top_words]
+            sorted_idx = np.argsort(topic)[::-1]
+            top_indices = sorted_idx[: self.n_top_words]
+
             for i in top_indices:
                 topic_rows.append({
                     "topic_id": topic_idx,
-                    "term": idx2term[i],
-                    "weight": topic[i]
+                    "term": terms[int(i)],
+                    "weight": topic[i],
                 })
 
         df = pd.DataFrame(topic_rows)

diff --git a/algorithms/models.py b/algorithms/models.py
@@ -0,0 +1,8 @@
+from typing import List
+from dataclasses import dataclass
+
+
+@dataclass
+class PreprocessedDocument:
+    doc_id: str  # collected by NLPVectorizer into its doc_ids list
+    tokens: List[str]  # terms; NLPVectorizer treats any term with " " as n-gram
diff --git a/algorithms/vectorizer.py b/algorithms/vectorizer.py
@@ -0,0 +1,103 @@
+from typing import List
+import numpy as np
+from collections import Counter
+
+from algorithms.models import PreprocessedDocument
+
+
+class NLPVectorizer:
+    """Builds term statistics, a bag of words and a document-term matrix.
+
+    Input documents only need `doc_id` and `tokens` attributes; a term that
+    contains a space is treated as an n-gram, any other term as a word.
+    """
+
+    def __init__(self, preprocessed_output: List[PreprocessedDocument]):
+        """Store the documents and initialise all derived state as empty."""
+        self.documents = preprocessed_output
+        self.doc_ids = [doc.doc_id for doc in preprocessed_output]
+
+        # Corpus-wide counts and per-document presence counts
+        self.token_frequency = Counter()
+        self.token_document_frequency = Counter()
+        self.ngram_frequency = Counter()
+        self.ngram_document_frequency = Counter()
+
+        # bow + dtm
+        self.bag_of_words = []
+        self.vocab = {}
+        self.reverse_vocab = []
+        self.dtm = None
+
+    def analyze_frequencies(self):
+        """Recount word and n-gram frequencies over all documents."""
+        # Fresh counters: repeated calls must not inflate the counts
+        token_freq, token_doc_freq = Counter(), Counter()
+        ngram_freq, ngram_doc_freq = Counter(), Counter()
+
+        for doc in self.documents:
+            tokens = [t for t in doc.tokens if " " not in t]
+            ngrams = [t for t in doc.tokens if " " in t]
+            token_freq.update(tokens)
+            token_doc_freq.update(set(tokens))
+            ngram_freq.update(ngrams)
+            ngram_doc_freq.update(set(ngrams))
+
+        self.token_frequency = token_freq
+        self.token_document_frequency = token_doc_freq
+        self.ngram_frequency = ngram_freq
+        self.ngram_document_frequency = ngram_doc_freq
+
+    def build_bow(self):
+        """Return, per document, one entry for each distinct term.
+
+        Entries keep first-occurrence order. Frequencies are computed on
+        demand if `analyze_frequencies` has not populated them yet.
+        """
+        if not self.token_frequency and not self.ngram_frequency:
+            self.analyze_frequencies()
+
+        bow = []
+        for doc in self.documents:
+            entries = []
+            # dict.fromkeys de-duplicates while keeping first-seen order
+            for term in dict.fromkeys(doc.tokens):
+                is_ngram = " " in term
+                if is_ngram:
+                    freq, docs = (
+                        self.ngram_frequency, self.ngram_document_frequency)
+                else:
+                    freq, docs = (
+                        self.token_frequency, self.token_document_frequency)
+                entries.append({
+                    "term": term,
+                    "type": "ngram" if is_ngram else "word",
+                    "span": len(term.split(" ")),
+                    "freq": freq[term],
+                    "docs": docs[term],
+                    "filters": [],
+                })
+            bow.append(entries)
+
+        self.bag_of_words = bow
+        return bow
+
+    def build_vocabulary(self):
+        """Map every distinct term to its index in sorted order."""
+        terms = {term for doc in self.documents for term in doc.tokens}
+        sorted_terms = sorted(terms)
+        self.vocab = {term: i for i, term in enumerate(sorted_terms)}
+        self.reverse_vocab = sorted_terms
+        return self.vocab
+
+    def build_dtm(self):
+        """Return the (documents x vocabulary) term-count matrix."""
+        if not self.vocab:
+            self.build_vocabulary()
+
+        dtm = np.zeros((len(self.documents), len(self.vocab)), dtype=int)
+        for i, doc in enumerate(self.documents):
+            for term in doc.tokens:
+                dtm[i, self.vocab[term]] += 1
+        self.dtm = dtm
+        return dtm
diff --git a/cbc.yaml b/cbc.yaml
@@ -1,39 +1,24 @@
 author: Paul Kalhorn
-description: Compute Block that offers Topic Modeling Algorihtms
+description: Compute Block that offers topic modeling algorithms
 docker_image: ghcr.io/rwth-time/topic-modeling/topic-modeling
 entrypoints:
   lda_topic_modeling:
-    description: Sklearn LDA Topic Modeling 
+    description: Sklearn LDA Topic Modeling
     envs:
       LEARNING_METHOD: batch
       MAX_ITER: 10
       N_TOPICS: 5
       N_TOP_WORDS: 10
     inputs:
-      dtm:
+      preprocessed_docs:
         config:
-          dtm_BUCKET_NAME: null
-          dtm_FILE_EXT: pkl
-          dtm_FILE_NAME: null
-          dtm_FILE_PATH: null
-          dtm_S3_ACCESS_KEY: null
-          dtm_S3_HOST: null
-          dtm_S3_PORT: null
-          dtm_S3_SECRET_KEY: null
-        description: Pkl file of your numpy representation of the document-term matrix
-        type: file
-      vocab:
-        config:
-          vocab_BUCKET_NAME: null
-          vocab_FILE_EXT: pkl
-          vocab_FILE_NAME: null
-          vocab_FILE_PATH: null
-          vocab_S3_ACCESS_KEY: null
-          vocab_S3_HOST: null
-          vocab_S3_PORT: null
-          vocab_S3_SECRET_KEY: null
-        description: Pkl file of a dictionary that maps all words to their index in the DTM
-        type: file
+          preprocessed_docs_DB_TABLE: null
+          preprocessed_docs_PG_HOST: null
+          preprocessed_docs_PG_PASS: null
+          preprocessed_docs_PG_PORT: null
+          preprocessed_docs_PG_USER: null
+        description: A database table expected to contain the columns doc_id and tokens (a list of strings)
+        type: pg_table
     outputs:
       doc_topic:
         config:

diff --git a/dtm.pkl b/dtm.pkl