Small Commit

2026-01-18 02:55:04 +01:00
parent 9c5a87e61d
commit 06954aeef0
5 changed files with 787 additions and 0 deletions
--- a/de_DE-thorsten-medium.onnx
+++ b/de_DE-thorsten-medium.onnx
--- a/de_DE-thorsten-medium.onnx.json
+++ b/de_DE-thorsten-medium.onnx.json
@@ -0,0 +1,487 @@
+{
+  "audio": {
+    "sample_rate": 22050,
+    "quality": "medium"
+  },
+  "espeak": {
+    "voice": "de"
+  },
+  "inference": {
+    "noise_scale": 0.667,
+    "length_scale": 1,
+    "noise_w": 0.8
+  },
+  "phoneme_type": "espeak",
+  "phoneme_map": {},
+  "phoneme_id_map": {
+    "_": [
+      0
+    ],
+    "^": [
+      1
+    ],
+    "$": [
+      2
+    ],
+    " ": [
+      3
+    ],
+    "!": [
+      4
+    ],
+    "'": [
+      5
+    ],
+    "(": [
+      6
+    ],
+    ")": [
+      7
+    ],
+    ",": [
+      8
+    ],
+    "-": [
+      9
+    ],
+    ".": [
+      10
+    ],
+    ":": [
+      11
+    ],
+    ";": [
+      12
+    ],
+    "?": [
+      13
+    ],
+    "a": [
+      14
+    ],
+    "b": [
+      15
+    ],
+    "c": [
+      16
+    ],
+    "d": [
+      17
+    ],
+    "e": [
+      18
+    ],
+    "f": [
+      19
+    ],
+    "h": [
+      20
+    ],
+    "i": [
+      21
+    ],
+    "j": [
+      22
+    ],
+    "k": [
+      23
+    ],
+    "l": [
+      24
+    ],
+    "m": [
+      25
+    ],
+    "n": [
+      26
+    ],
+    "o": [
+      27
+    ],
+    "p": [
+      28
+    ],
+    "q": [
+      29
+    ],
+    "r": [
+      30
+    ],
+    "s": [
+      31
+    ],
+    "t": [
+      32
+    ],
+    "u": [
+      33
+    ],
+    "v": [
+      34
+    ],
+    "w": [
+      35
+    ],
+    "x": [
+      36
+    ],
+    "y": [
+      37
+    ],
+    "z": [
+      38
+    ],
+    "æ": [
+      39
+    ],
+    "ç": [
+      40
+    ],
+    "ð": [
+      41
+    ],
+    "ø": [
+      42
+    ],
+    "ħ": [
+      43
+    ],
+    "ŋ": [
+      44
+    ],
+    "œ": [
+      45
+    ],
+    "ǀ": [
+      46
+    ],
+    "ǁ": [
+      47
+    ],
+    "ǂ": [
+      48
+    ],
+    "ǃ": [
+      49
+    ],
+    "ɐ": [
+      50
+    ],
+    "ɑ": [
+      51
+    ],
+    "ɒ": [
+      52
+    ],
+    "ɓ": [
+      53
+    ],
+    "ɔ": [
+      54
+    ],
+    "ɕ": [
+      55
+    ],
+    "ɖ": [
+      56
+    ],
+    "ɗ": [
+      57
+    ],
+    "ɘ": [
+      58
+    ],
+    "ə": [
+      59
+    ],
+    "ɚ": [
+      60
+    ],
+    "ɛ": [
+      61
+    ],
+    "ɜ": [
+      62
+    ],
+    "ɞ": [
+      63
+    ],
+    "ɟ": [
+      64
+    ],
+    "ɠ": [
+      65
+    ],
+    "ɡ": [
+      66
+    ],
+    "ɢ": [
+      67
+    ],
+    "ɣ": [
+      68
+    ],
+    "ɤ": [
+      69
+    ],
+    "ɥ": [
+      70
+    ],
+    "ɦ": [
+      71
+    ],
+    "ɧ": [
+      72
+    ],
+    "ɨ": [
+      73
+    ],
+    "ɪ": [
+      74
+    ],
+    "ɫ": [
+      75
+    ],
+    "ɬ": [
+      76
+    ],
+    "ɭ": [
+      77
+    ],
+    "ɮ": [
+      78
+    ],
+    "ɯ": [
+      79
+    ],
+    "ɰ": [
+      80
+    ],
+    "ɱ": [
+      81
+    ],
+    "ɲ": [
+      82
+    ],
+    "ɳ": [
+      83
+    ],
+    "ɴ": [
+      84
+    ],
+    "ɵ": [
+      85
+    ],
+    "ɶ": [
+      86
+    ],
+    "ɸ": [
+      87
+    ],
+    "ɹ": [
+      88
+    ],
+    "ɺ": [
+      89
+    ],
+    "ɻ": [
+      90
+    ],
+    "ɽ": [
+      91
+    ],
+    "ɾ": [
+      92
+    ],
+    "ʀ": [
+      93
+    ],
+    "ʁ": [
+      94
+    ],
+    "ʂ": [
+      95
+    ],
+    "ʃ": [
+      96
+    ],
+    "ʄ": [
+      97
+    ],
+    "ʈ": [
+      98
+    ],
+    "ʉ": [
+      99
+    ],
+    "ʊ": [
+      100
+    ],
+    "ʋ": [
+      101
+    ],
+    "ʌ": [
+      102
+    ],
+    "ʍ": [
+      103
+    ],
+    "ʎ": [
+      104
+    ],
+    "ʏ": [
+      105
+    ],
+    "ʐ": [
+      106
+    ],
+    "ʑ": [
+      107
+    ],
+    "ʒ": [
+      108
+    ],
+    "ʔ": [
+      109
+    ],
+    "ʕ": [
+      110
+    ],
+    "ʘ": [
+      111
+    ],
+    "ʙ": [
+      112
+    ],
+    "ʛ": [
+      113
+    ],
+    "ʜ": [
+      114
+    ],
+    "ʝ": [
+      115
+    ],
+    "ʟ": [
+      116
+    ],
+    "ʡ": [
+      117
+    ],
+    "ʢ": [
+      118
+    ],
+    "ʲ": [
+      119
+    ],
+    "ˈ": [
+      120
+    ],
+    "ˌ": [
+      121
+    ],
+    "ː": [
+      122
+    ],
+    "ˑ": [
+      123
+    ],
+    "˞": [
+      124
+    ],
+    "β": [
+      125
+    ],
+    "θ": [
+      126
+    ],
+    "χ": [
+      127
+    ],
+    "ᵻ": [
+      128
+    ],
+    "ⱱ": [
+      129
+    ],
+    "0": [
+      130
+    ],
+    "1": [
+      131
+    ],
+    "2": [
+      132
+    ],
+    "3": [
+      133
+    ],
+    "4": [
+      134
+    ],
+    "5": [
+      135
+    ],
+    "6": [
+      136
+    ],
+    "7": [
+      137
+    ],
+    "8": [
+      138
+    ],
+    "9": [
+      139
+    ],
+    "̧": [
+      140
+    ],
+    "̃": [
+      141
+    ],
+    "̪": [
+      142
+    ],
+    "̯": [
+      143
+    ],
+    "̩": [
+      144
+    ],
+    "ʰ": [
+      145
+    ],
+    "ˤ": [
+      146
+    ],
+    "ε": [
+      147
+    ],
+    "↓": [
+      148
+    ],
+    "#": [
+      149
+    ],
+    "\"": [
+      150
+    ],
+    "↑": [
+      151
+    ]
+  },
+  "num_symbols": 256,
+  "num_speakers": 1,
+  "speaker_id_map": {},
+  "piper_version": "1.0.0",
+  "language": {
+    "code": "de_DE",
+    "family": "de",
+    "region": "DE",
+    "name_native": "Deutsch",
+    "name_english": "German",
+    "country_english": "Germany"
+  },
+  "dataset": "thorsten"
+}
--- a/main_bak.py
+++ b/main_bak.py
@@ -0,0 +1,227 @@
+import threading
+import queue
+import json
+import time
+import os
+import subprocess
+
+# =========================
+# KONFIGURATION
+# =========================
+
+# Path of the Vosk speech-recognition model, passed to vosk.Model().
+VOSK_MODEL_PATH = "vosk-model-de-0.21"
+# Executable name of the Piper TTS command-line tool.
+PIPER_BIN = "piper"
+# Piper voice model file handed to Piper via --model.
+PIPER_MODEL = "de_DE-thorsten-medium.onnx"
+# Sample rate in Hz; used for aplay playback as well as for the microphone
+# stream and the Vosk recognizer.
+SAMPLE_RATE = 22050
+
+# =========================
+# STATES
+# =========================
+
+# Possible values of the global `state` variable.
+# IDLE: set by reset_context(); the wake-word loop reacts to this state.
+STATE_IDLE = "IDLE"
+# LISTENING: the only state in which handle_text() processes input.
+STATE_LISTENING = "LISTENING"
+# SPEAKING: set by speak() while audio is synthesized and played.
+STATE_SPEAKING = "SPEAKING"
+
+# =========================
+# GLOBALER ZUSTAND
+# =========================
+
+# Current assistant state; one of the STATE_* constants.
+state = STATE_IDLE
+# Dialog context of the request currently being handled.
+context = {
+    # Name of the detected intent (a key of INTENTS), or None.
+    "intent": None,
+    # Slot values collected so far, keyed by slot name.
+    "slots": {},
+    # Slot names the detected intent needs before its skill can run.
+    "required_slots": [],
+    # Slot the assistant has just asked the user for, or None.
+    "pending_slot": None
+}
+
+# Carries recognized text (not raw audio) from the Vosk listener thread
+# to the main loop.
+audio_queue = queue.Queue()
+
+# =========================
+# TTS (PIPER)
+# =========================
+
+def speak(text):
+    """Synthesize *text* with Piper and play it through aplay.
+
+    Sets the global ``state`` to STATE_SPEAKING for the duration of the call
+    and to STATE_LISTENING afterwards. Blocks until both child processes
+    have finished.
+
+    Args:
+        text: The sentence to speak; sent to Piper as UTF-8 on stdin.
+
+    Raises:
+        OSError: If the Piper or aplay executable cannot be started.
+    """
+    global state
+    state = STATE_SPEAKING
+    print(f"[TTS] {text}")
+
+    try:
+        # Piper is asked for raw output on stdout (--output-raw); the whole
+        # result is collected before playback starts.
+        process = subprocess.Popen(
+            [PIPER_BIN, "--model", PIPER_MODEL, "--output-raw"],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE
+        )
+
+        audio = process.communicate(input=text.encode("utf-8"))[0]
+
+        play = subprocess.Popen(
+            ["aplay", "-r", str(SAMPLE_RATE), "-f", "S16_LE"],
+            stdin=subprocess.PIPE
+        )
+        play.communicate(audio)
+    finally:
+        # Always leave the SPEAKING state, even if Piper or aplay could not
+        # be started; otherwise handle_text() would ignore all further input.
+        state = STATE_LISTENING
+
+
+# =========================
+# INTENTS & SLOTS
+# =========================
+
+# Intent table: maps an intent name to
+#   "keywords"       - lowercase substrings that trigger the intent, and
+#   "required_slots" - slot names that must be filled before the matching
+#                      skill in SKILLS is executed.
+INTENTS = {
+    "weather": {
+        "keywords": ["wetter", "temperatur", "regen"],
+        "required_slots": ["location"]
+    },
+    "timer": {
+        "keywords": ["timer"],
+        "required_slots": ["duration"]
+    }
+}
+
+# TODO: possibly rework the keyword matching with regex
+
+def detect_intent(text):
+    """Return the name of the first intent with a keyword contained in *text*.
+
+    The comparison is a case-insensitive substring test against the
+    "keywords" lists in INTENTS, checked in the table's iteration order.
+
+    Args:
+        text: The recognized utterance.
+
+    Returns:
+        The intent name, or None if no keyword occurs in the text.
+    """
+    lowered = text.lower()
+    matches = (
+        intent_name
+        for intent_name, spec in INTENTS.items()
+        if any(keyword in lowered for keyword in spec["keywords"])
+    )
+    return next(matches, None)
+
+
+# =========================
+# SKILLS
+# =========================
+
+def weather_skill(slots):
+    """Build the (fixed, placeholder) weather answer for slots["location"]."""
+    return f"Das Wetter in {slots['location']} ist sonnig bei 20 Grad."
+
+def timer_skill(slots):
+    """Build the confirmation sentence for a timer of slots["duration"].
+
+    Only the answer text is produced; no timer is started here.
+    """
+    return f"Der Timer für {slots['duration']} Minuten wurde gestartet."
+
+# Dispatch table: intent name -> function that receives the collected
+# slots dict and returns the sentence to speak.
+SKILLS = {
+    "weather": weather_skill,
+    "timer": timer_skill
+}
+
+# =========================
+# DIALOGLOGIK
+# =========================
+
+def handle_text(text):
+    """Advance the dialog by one recognized utterance.
+
+    Input is ignored unless the assistant is in STATE_LISTENING. Otherwise
+    the text first answers an open follow-up question (if any), then an
+    intent is detected when none is active, missing slots are requested one
+    at a time, and finally the matching skill is run and its answer spoken.
+
+    Args:
+        text: The recognized utterance.
+    """
+    global context, state
+
+    if state != STATE_LISTENING:
+        return
+
+    print(f"[STT] {text}")
+
+    # 1. Treat the utterance as the answer to an open follow-up question.
+    awaited_slot = context["pending_slot"]
+    if awaited_slot:
+        context["slots"][awaited_slot] = text
+        context["pending_slot"] = None
+
+    # 2. Detect the intent if no dialog is in progress.
+    if not context["intent"]:
+        recognised = detect_intent(text)
+        if not recognised:
+            speak("Das habe ich nicht verstanden.")
+            reset_context()
+            return
+
+        context["intent"] = recognised
+        context["required_slots"] = INTENTS[recognised]["required_slots"]
+
+    # 3. Ask for the first required slot that has no value yet.
+    missing_slot = next(
+        (name for name in context["required_slots"]
+         if name not in context["slots"]),
+        None,
+    )
+    if missing_slot is not None:
+        context["pending_slot"] = missing_slot
+        ask_for_slot(missing_slot)
+        return
+
+    # 4. All slots are filled: run the skill and speak its answer.
+    skill = SKILLS[context["intent"]]
+    speak(skill(context["slots"]))
+    reset_context()
+
+
+def ask_for_slot(slot):
+    """Speak the follow-up question for *slot*.
+
+    Args:
+        slot: Name of the missing slot; unknown names get a generic prompt.
+    """
+    fallback = "Bitte spezifizieren."
+    prompts = {
+        "location": "Für welchen Ort?",
+        "duration": "Wie lange soll der Timer laufen?"
+    }
+    prompt = prompts.get(slot, fallback)
+    speak(prompt)
+
+
+def reset_context():
+    """Discard the current dialog and put the assistant back into STATE_IDLE.
+
+    Rebinds the global ``context`` to a fresh, empty context dict.
+    """
+    global context, state
+    context = dict(
+        intent=None,
+        slots={},
+        required_slots=[],
+        pending_slot=None,
+    )
+    state = STATE_IDLE
+
+
+# =========================
+# VOSK LISTENER
+# =========================
+
+def vosk_listener():
+    """Continuously transcribe microphone audio and queue the recognized text.
+
+    Opens a mono 16-bit PyAudio input stream at SAMPLE_RATE, feeds it in
+    blocks of 4000 frames to a Vosk recognizer and puts every non-empty
+    result text into ``audio_queue``. Never returns.
+    """
+    import vosk
+    import pyaudio
+
+    frames_per_read = 4000
+
+    model = vosk.Model(VOSK_MODEL_PATH)
+    recognizer = vosk.KaldiRecognizer(model, SAMPLE_RATE)
+
+    audio_api = pyaudio.PyAudio()
+    mic = audio_api.open(
+        format=pyaudio.paInt16,
+        channels=1,
+        rate=SAMPLE_RATE,
+        input=True,
+        frames_per_buffer=frames_per_read
+    )
+    mic.start_stream()
+
+    while True:
+        chunk = mic.read(frames_per_read, exception_on_overflow=False)
+        # Only fetch a result once the recognizer signals it has one.
+        if not recognizer.AcceptWaveform(chunk):
+            continue
+        utterance = json.loads(recognizer.Result()).get("text", "")
+        if utterance:
+            audio_queue.put(utterance)
+
+
+# =========================
+# WAKEWORD (SIMPLIFIZIERT)
+# =========================
+
+def fake_wakeword_detector():
+    """Stand-in for a wake-word engine: start listening whenever idle.
+
+    Polls the global ``state`` every 0.1 s. When it is STATE_IDLE, the
+    state is switched to STATE_LISTENING and the greeting is spoken.
+    Never returns.
+    """
+    global state
+    while True:
+        # Sleep on every iteration, not only when idle, so the loop does not
+        # spin at full CPU while the assistant is listening or speaking.
+        time.sleep(0.1)
+        if state == STATE_IDLE:
+            state = STATE_LISTENING
+            speak("Wie kann ich helfen?")
+
+
+# =========================
+# MAIN LOOP
+# =========================
+
+def main():
+    """Start the background threads and dispatch recognized text forever.
+
+    The Vosk listener and the wake-word stand-in run as daemon threads;
+    this thread takes text from ``audio_queue`` and passes it to
+    handle_text().
+    """
+    for worker in (vosk_listener, fake_wakeword_detector):
+        threading.Thread(target=worker, daemon=True).start()
+
+    while True:
+        try:
+            utterance = audio_queue.get(timeout=0.1)
+            handle_text(utterance)
+        except queue.Empty:
+            # Nothing recognized within the timeout; poll again.
+            continue
+
+
+if __name__ == "__main__":
+    main()
--- a/test_tts.py
+++ b/test_tts.py
@@ -0,0 +1,27 @@
+import subprocess
+
+# Executable name of the Piper TTS command-line tool.
+PIPER_BIN = "piper"
+# Piper voice model file handed to Piper via --model.
+PIPER_MODEL = "de_DE-thorsten-medium.onnx"
+SAMPLE_RATE = 22050 # found out by running: aplay -v assistant/audio-tts/predefined.wav
+
+def speak(text):
+    """Synthesize *text* with Piper and play the result through aplay.
+
+    Blocks until both child processes have finished.
+
+    Args:
+        text: The sentence to speak; sent to Piper as UTF-8 on stdin.
+    """
+    print(f"[TTS] {text}")
+
+    piper_cmd = [PIPER_BIN, "--model", PIPER_MODEL, "--output-raw"]
+    synthesizer = subprocess.Popen(
+        piper_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE
+    )
+    # communicate() returns (stdout, stderr); only stdout is captured here.
+    pcm_bytes, _ = synthesizer.communicate(input=text.encode("utf-8"))
+
+    aplay_cmd = ["aplay", "-r", str(SAMPLE_RATE), "-f", "S16_LE"]
+    player = subprocess.Popen(aplay_cmd, stdin=subprocess.PIPE)
+    player.communicate(pcm_bytes)
+
+    
+# Manual smoke test: speak one fixed German sentence as soon as the
+# module is executed.
+text="Das Auto musste repariert werden, bevor wir weiterfahren konnten, neuer text"
+speak(text)
--- a/test_vosk.py
+++ b/test_vosk.py
@@ -0,0 +1,46 @@
+import threading
+import queue
+import json
+
+# Carries recognized text (not raw audio) from the listener thread to main().
+audio_queue = queue.Queue()
+# Sample rate in Hz used for both the microphone stream and the recognizer.
+SAMPLE_RATE = 16000
+
+def vosk_listener(
+    model_path="/home/tino/Documents/_Documents/Schule/4 Wilhelm Maybach Schule/2BKI Jahr 2/Abschlussprojekt/test assistant/assistant_all_in_one/vosk-model-de-0.21/",
+):
+    """Continuously transcribe microphone audio and queue the recognized text.
+
+    Opens a mono 16-bit PyAudio input stream at SAMPLE_RATE, feeds it in
+    blocks of 4000 frames to a Vosk recognizer and puts every non-empty
+    result text into ``audio_queue``. Never returns.
+
+    Args:
+        model_path: Location of the Vosk model. The default is the original
+            developer's absolute path; pass another path to run the test on
+            a different machine.
+    """
+    import vosk
+    import pyaudio
+
+    model = vosk.Model(model_path)
+    rec = vosk.KaldiRecognizer(model, SAMPLE_RATE)
+
+    p = pyaudio.PyAudio()
+    stream = p.open(
+        format=pyaudio.paInt16,
+        channels=1,
+        rate=SAMPLE_RATE,
+        input=True,
+        frames_per_buffer=4000
+    )
+    stream.start_stream()
+
+    while True:
+        data = stream.read(4000, exception_on_overflow=False)
+        # Only fetch a result once the recognizer signals it has one.
+        if rec.AcceptWaveform(data):
+            result = json.loads(rec.Result())
+            text = result.get("text", "")
+            if text:
+                audio_queue.put(text)
+
+def main():
+    """Start the Vosk listener thread and print every recognized utterance.
+
+    Runs until the process is terminated.
+    """
+    threading.Thread(target=vosk_listener, daemon=True).start()
+
+    while True:
+        try:
+            text = audio_queue.get(timeout=0.1)
+            # Label is "STT" (speech-to-text), matching the log prefix used
+            # by the assistant's dialog handler.
+            print("[STT]", text)
+        except queue.Empty:
+            # Nothing recognized within the timeout; poll again.
+            pass
+
+if __name__ == "__main__":
+    main()