Small Commit

2026-01-18 02:55:04 +01:00
parent 9c5a87e61d
commit 06954aeef0
5 changed files with 787 additions and 0 deletions
--- a/de_DE-thorsten-medium.onnx
+++ b/de_DE-thorsten-medium.onnx
--- a/de_DE-thorsten-medium.onnx.json
+++ b/de_DE-thorsten-medium.onnx.json
@@ -0,0 +1,487 @@
 {
  "audio": {
    "sample_rate": 22050,
    "quality": "medium"
  },
  "espeak": {
    "voice": "de"
  },
  "inference": {
    "noise_scale": 0.667,
    "length_scale": 1,
    "noise_w": 0.8
  },
  "phoneme_type": "espeak",
  "phoneme_map": {},
  "phoneme_id_map": {
    "_": [
      0
    ],
    "^": [
      1
    ],
    "$": [
      2
    ],
    " ": [
      3
    ],
    "!": [
      4
    ],
    "'": [
      5
    ],
    "(": [
      6
    ],
    ")": [
      7
    ],
    ",": [
      8
    ],
    "-": [
      9
    ],
    ".": [
      10
    ],
    ":": [
      11
    ],
    ";": [
      12
    ],
    "?": [
      13
    ],
    "a": [
      14
    ],
    "b": [
      15
    ],
    "c": [
      16
    ],
    "d": [
      17
    ],
    "e": [
      18
    ],
    "f": [
      19
    ],
    "h": [
      20
    ],
    "i": [
      21
    ],
    "j": [
      22
    ],
    "k": [
      23
    ],
    "l": [
      24
    ],
    "m": [
      25
    ],
    "n": [
      26
    ],
    "o": [
      27
    ],
    "p": [
      28
    ],
    "q": [
      29
    ],
    "r": [
      30
    ],
    "s": [
      31
    ],
    "t": [
      32
    ],
    "u": [
      33
    ],
    "v": [
      34
    ],
    "w": [
      35
    ],
    "x": [
      36
    ],
    "y": [
      37
    ],
    "z": [
      38
    ],
    "æ": [
      39
    ],
    "ç": [
      40
    ],
    "ð": [
      41
    ],
    "ø": [
      42
    ],
    "ħ": [
      43
    ],
    "ŋ": [
      44
    ],
    "œ": [
      45
    ],
    "ǀ": [
      46
    ],
    "ǁ": [
      47
    ],
    "ǂ": [
      48
    ],
    "ǃ": [
      49
    ],
    "ɐ": [
      50
    ],
    "ɑ": [
      51
    ],
    "ɒ": [
      52
    ],
    "ɓ": [
      53
    ],
    "ɔ": [
      54
    ],
    "ɕ": [
      55
    ],
    "ɖ": [
      56
    ],
    "ɗ": [
      57
    ],
    "ɘ": [
      58
    ],
    "ə": [
      59
    ],
    "ɚ": [
      60
    ],
    "ɛ": [
      61
    ],
    "ɜ": [
      62
    ],
    "ɞ": [
      63
    ],
    "ɟ": [
      64
    ],
    "ɠ": [
      65
    ],
    "ɡ": [
      66
    ],
    "ɢ": [
      67
    ],
    "ɣ": [
      68
    ],
    "ɤ": [
      69
    ],
    "ɥ": [
      70
    ],
    "ɦ": [
      71
    ],
    "ɧ": [
      72
    ],
    "ɨ": [
      73
    ],
    "ɪ": [
      74
    ],
    "ɫ": [
      75
    ],
    "ɬ": [
      76
    ],
    "ɭ": [
      77
    ],
    "ɮ": [
      78
    ],
    "ɯ": [
      79
    ],
    "ɰ": [
      80
    ],
    "ɱ": [
      81
    ],
    "ɲ": [
      82
    ],
    "ɳ": [
      83
    ],
    "ɴ": [
      84
    ],
    "ɵ": [
      85
    ],
    "ɶ": [
      86
    ],
    "ɸ": [
      87
    ],
    "ɹ": [
      88
    ],
    "ɺ": [
      89
    ],
    "ɻ": [
      90
    ],
    "ɽ": [
      91
    ],
    "ɾ": [
      92
    ],
    "ʀ": [
      93
    ],
    "ʁ": [
      94
    ],
    "ʂ": [
      95
    ],
    "ʃ": [
      96
    ],
    "ʄ": [
      97
    ],
    "ʈ": [
      98
    ],
    "ʉ": [
      99
    ],
    "ʊ": [
      100
    ],
    "ʋ": [
      101
    ],
    "ʌ": [
      102
    ],
    "ʍ": [
      103
    ],
    "ʎ": [
      104
    ],
    "ʏ": [
      105
    ],
    "ʐ": [
      106
    ],
    "ʑ": [
      107
    ],
    "ʒ": [
      108
    ],
    "ʔ": [
      109
    ],
    "ʕ": [
      110
    ],
    "ʘ": [
      111
    ],
    "ʙ": [
      112
    ],
    "ʛ": [
      113
    ],
    "ʜ": [
      114
    ],
    "ʝ": [
      115
    ],
    "ʟ": [
      116
    ],
    "ʡ": [
      117
    ],
    "ʢ": [
      118
    ],
    "ʲ": [
      119
    ],
    "ˈ": [
      120
    ],
    "ˌ": [
      121
    ],
    "ː": [
      122
    ],
    "ˑ": [
      123
    ],
    "˞": [
      124
    ],
    "β": [
      125
    ],
    "θ": [
      126
    ],
    "χ": [
      127
    ],
    "ᵻ": [
      128
    ],
    "ⱱ": [
      129
    ],
    "0": [
      130
    ],
    "1": [
      131
    ],
    "2": [
      132
    ],
    "3": [
      133
    ],
    "4": [
      134
    ],
    "5": [
      135
    ],
    "6": [
      136
    ],
    "7": [
      137
    ],
    "8": [
      138
    ],
    "9": [
      139
    ],
    "̧": [
      140
    ],
    "̃": [
      141
    ],
    "̪": [
      142
    ],
    "̯": [
      143
    ],
    "̩": [
      144
    ],
    "ʰ": [
      145
    ],
    "ˤ": [
      146
    ],
    "ε": [
      147
    ],
    "↓": [
      148
    ],
    "#": [
      149
    ],
    "\"": [
      150
    ],
    "↑": [
      151
    ]
  },
  "num_symbols": 256,
  "num_speakers": 1,
  "speaker_id_map": {},
  "piper_version": "1.0.0",
  "language": {
    "code": "de_DE",
    "family": "de",
    "region": "DE",
    "name_native": "Deutsch",
    "name_english": "German",
    "country_english": "Germany"
  },
  "dataset": "thorsten"
 }
--- a/main_bak.py
+++ b/main_bak.py
@@ -0,0 +1,227 @@
 import threading
 import queue
 import json
 import time
 import os
 import subprocess
 # =========================
 # CONFIGURATION
 # =========================
 # Path handed to vosk.Model() in vosk_listener().
 VOSK_MODEL_PATH = "vosk-model-de-0.21"
 # Executable name and voice model used by speak() to synthesize audio.
 PIPER_BIN = "piper"
 PIPER_MODEL = "de_DE-thorsten-medium.onnx"
 # Used for aplay playback in speak() and for the microphone stream and
 # recognizer in vosk_listener().
 SAMPLE_RATE = 22050
 # =========================
 # STATES
 # =========================
 # Values taken by the global `state` variable below.
 STATE_IDLE = "IDLE"
 STATE_LISTENING = "LISTENING"
 STATE_SPEAKING = "SPEAKING"
 # =========================
 # GLOBAL STATE
 # =========================
 # Current assistant state; written by speak(), reset_context() and
 # fake_wakeword_detector(), read by handle_text().
 state = STATE_IDLE
 # Dialog context for the request currently being handled:
 #   intent         - name of the detected intent, or None
 #   slots          - slot name -> value collected so far
 #   required_slots - slot names the intent needs before its skill runs
 #   pending_slot   - slot the user was last asked for, or None
 context = {
    "intent": None,
    "slots": {},
    "required_slots": [],
    "pending_slot": None
 }
 # Despite its name this queue carries recognized text (strings), not audio:
 # vosk_listener() puts transcripts in, main() takes them out.
 audio_queue = queue.Queue()
 # =========================
 # TTS (PIPER)
 # =========================
 def speak(text):
    """Synthesize *text* with Piper and play it through ``aplay``.

    The global ``state`` is ``STATE_SPEAKING`` while this function runs and
    is set to ``STATE_LISTENING`` when it exits. The call blocks until both
    subprocesses have finished.

    Args:
        text: The sentence to speak; sent to Piper as UTF-8 on stdin.

    Raises:
        OSError: If the Piper or ``aplay`` executable cannot be started.
    """
    global state
    state = STATE_SPEAKING
    print(f"[TTS] {text}")
    try:
        # Step 1: Piper reads the text on stdin and writes raw audio to stdout.
        process = subprocess.Popen(
            [PIPER_BIN, "--model", PIPER_MODEL, "--output-raw"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        audio = process.communicate(input=text.encode("utf-8"))[0]
        # Step 2: feed the collected audio to aplay for playback.
        play = subprocess.Popen(
            ["aplay", "-r", str(SAMPLE_RATE), "-f", "S16_LE"],
            stdin=subprocess.PIPE
        )
        play.communicate(audio)
    finally:
        # Leave SPEAKING even when synthesis or playback fails; otherwise
        # handle_text() would silently discard every later utterance.
        state = STATE_LISTENING
 # =========================
 # INTENTS & SLOTS
 # =========================
 # Intent table: for every intent, the keywords that trigger it and the
 # slots that must be filled before its skill can run.
 INTENTS = {
    "weather": {
        "keywords": ["wetter", "temperatur", "regen"],
        "required_slots": ["location"],
    },
    "timer": {
        "keywords": ["timer"],
        "required_slots": ["duration"],
    },
 }


 # TODO: possibly rework with regex
 def detect_intent(text):
    """Return the first intent whose keyword occurs in *text*, else ``None``.

    Matching is a case-insensitive substring test, and intents are tried
    in the order they appear in ``INTENTS``.
    """
    lowered = text.lower()
    return next(
        (
            intent_name
            for intent_name, spec in INTENTS.items()
            if any(keyword in lowered for keyword in spec["keywords"])
        ),
        None,
    )
 # =========================
 # SKILLS
 # =========================
 def weather_skill(slots):
    """Return the (currently hard-coded) weather sentence for a location.

    Args:
        slots: Mapping that must contain the key ``"location"``.

    Raises:
        KeyError: If ``"location"`` is missing from *slots*.
    """
    place = slots["location"]
    answer = f"Das Wetter in {place} ist sonnig bei 20 Grad."
    return answer
 def timer_skill(slots):
    """Return the confirmation sentence for a started timer.

    No timer is actually scheduled here; only the reply text is built.

    Args:
        slots: Mapping that must contain the key ``"duration"``.

    Raises:
        KeyError: If ``"duration"`` is missing from *slots*.
    """
    minutes = slots["duration"]
    confirmation = f"Der Timer für {minutes} Minuten wurde gestartet."
    return confirmation
 # Dispatch table: intent name -> function that receives the slots dict
 # and returns the sentence to speak. Keys match the keys of INTENTS.
 SKILLS = {
    "weather": weather_skill,
    "timer": timer_skill
 }
 # =========================
 # DIALOGLOGIK
 # =========================
 def handle_text(text):
    """Advance the dialog by one recognized utterance.

    Input is ignored unless the assistant is in ``STATE_LISTENING``.
    Otherwise the utterance is used, in this order, to

    1. fill the slot the user was last asked for (if any),
    2. detect the intent when none is active yet,
    3. ask for the first required slot that is still missing, or
    4. run the intent's skill, speak its result and reset the context.

    Args:
        text: Transcript of what the user said.
    """
    global context, state
    if state != STATE_LISTENING:
        return
    print(f"[STT] {text}")

    # 1. The utterance answers an open follow-up question.
    awaited = context["pending_slot"]
    if awaited:
        context["slots"][awaited] = text
        context["pending_slot"] = None

    # 2. No active intent yet: try to detect one.
    if not context["intent"]:
        detected = detect_intent(text)
        if not detected:
            speak("Das habe ich nicht verstanden.")
            reset_context()
            return
        context["intent"] = detected
        context["required_slots"] = INTENTS[detected]["required_slots"]

    # 3. Ask for the first required slot that has no value yet.
    missing = next(
        (name for name in context["required_slots"]
         if name not in context["slots"]),
        None,
    )
    if missing is not None:
        context["pending_slot"] = missing
        ask_for_slot(missing)
        return

    # 4. Everything is known: run the skill and start over.
    skill = SKILLS[context["intent"]]
    speak(skill(context["slots"]))
    reset_context()
 def ask_for_slot(slot):
    """Speak the follow-up question for *slot*.

    Slots without a dedicated question get a generic prompt.

    Args:
        slot: Name of the slot whose value is needed.
    """
    prompts = dict(
        location="Für welchen Ort?",
        duration="Wie lange soll der Timer laufen?",
    )
    try:
        prompt = prompts[slot]
    except KeyError:
        prompt = "Bitte spezifizieren."
    speak(prompt)
 def reset_context():
    """Discard the current dialog and return the assistant to idle.

    Rebinds the global ``context`` to a fresh, empty dialog context and
    sets the global ``state`` to ``STATE_IDLE``.
    """
    global context, state
    context = dict(
        intent=None,
        slots={},
        required_slots=[],
        pending_slot=None,
    )
    state = STATE_IDLE
 # =========================
 # VOSK LISTENER
 # =========================
 def vosk_listener():
    """Read microphone audio forever and queue every recognized utterance.

    Opens a mono 16-bit input stream at ``SAMPLE_RATE`` and feeds it to a
    Vosk recognizer in chunks. Whenever ``AcceptWaveform`` returns a true
    value, the ``"text"`` field of the recognizer result is put on
    ``audio_queue`` unless it is empty. This function never returns; it is
    meant to run in its own thread.
    """
    import vosk
    import pyaudio

    chunk_frames = 4000
    model = vosk.Model(VOSK_MODEL_PATH)
    recognizer = vosk.KaldiRecognizer(model, SAMPLE_RATE)
    pa = pyaudio.PyAudio()
    mic = pa.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=chunk_frames,
    )
    mic.start_stream()
    while True:
        # Overflows are tolerated rather than raised.
        chunk = mic.read(chunk_frames, exception_on_overflow=False)
        if not recognizer.AcceptWaveform(chunk):
            continue
        utterance = json.loads(recognizer.Result()).get("text", "")
        if utterance:
            audio_queue.put(utterance)
 # =========================
 # WAKEWORD (SIMPLIFIZIERT)
 # =========================
 def fake_wakeword_detector():
    """Stand-in for a wake-word engine: wake the assistant whenever it is idle.

    Polls the global ``state`` every 0.1 s. When it is ``STATE_IDLE``, the
    state is switched to ``STATE_LISTENING`` and the greeting is spoken.
    This function never returns; it is meant to run in its own thread.
    """
    global state
    while True:
        # Sleep on every pass, not only while idle. Without this the loop
        # busy-spins whenever the assistant is listening or speaking.
        time.sleep(0.1)
        if state == STATE_IDLE:
            state = STATE_LISTENING
            speak("Wie kann ich helfen?")
 # =========================
 # MAIN LOOP
 # =========================
 def main():
    """Start the background workers and dispatch recognized text forever.

    The speech listener and the wake-word stand-in run as daemon threads;
    this thread takes transcripts from ``audio_queue`` and passes each one
    to ``handle_text``.
    """
    for worker in (vosk_listener, fake_wakeword_detector):
        threading.Thread(target=worker, daemon=True).start()
    while True:
        try:
            utterance = audio_queue.get(timeout=0.1)
        except queue.Empty:
            # Nothing recognized within the timeout; poll again.
            continue
        handle_text(utterance)
 if __name__ == "__main__":
    main()
--- a/test_tts.py
+++ b/test_tts.py
@@ -0,0 +1,27 @@
 import subprocess
 # Executable name and voice model used by speak() below.
 PIPER_BIN = "piper"
 PIPER_MODEL = "de_DE-thorsten-medium.onnx"
 SAMPLE_RATE = 22050 # found out via: aplay -v assistant/audio-tts/predefined.wav
 def speak(text):
    """Synthesize *text* with Piper and play the result through ``aplay``.

    Blocks until both subprocesses have finished.

    Args:
        text: The sentence to speak; sent to Piper as UTF-8 on stdin.
    """
    print(f"[TTS] {text}")
    synth_cmd = [PIPER_BIN, "--model", PIPER_MODEL, "--output-raw"]
    # Piper's complete stdout is collected before playback starts.
    pcm = subprocess.run(
        synth_cmd,
        input=text.encode("utf-8"),
        stdout=subprocess.PIPE,
    ).stdout
    playback_cmd = ["aplay", "-r", str(SAMPLE_RATE), "-f", "S16_LE"]
    subprocess.run(playback_cmd, input=pcm)
 # Manual smoke test: speaks one fixed German sentence as soon as the
 # script is executed (there is no __main__ guard).
 text="Das Auto musste repariert werden, bevor wir weiterfahren konnten, neuer text"
 speak(text)
--- a/test_vosk.py
+++ b/test_vosk.py
@@ -0,0 +1,46 @@
 import threading
 import queue
 import json
 # Carries recognized text (strings) from vosk_listener() to main().
 audio_queue = queue.Queue()
 # Used for both the microphone stream and the recognizer in vosk_listener().
 SAMPLE_RATE = 16000
 def vosk_listener():
    """Read microphone audio forever and queue every recognized utterance.

    The model directory is taken from the ``VOSK_MODEL_PATH`` environment
    variable when it is set, and otherwise falls back to the original
    hard-coded location. Audio is read as mono 16-bit at ``SAMPLE_RATE``;
    whenever ``AcceptWaveform`` returns a true value, the ``"text"`` field
    of the recognizer result is put on ``audio_queue`` unless it is empty.
    This function never returns; it is meant to run in its own thread.
    """
    import os
    import vosk
    import pyaudio
    # The fallback is one developer's absolute path; the environment
    # override lets the script run on other machines without editing it.
    VOSK_MODEL_PATH = os.environ.get(
        "VOSK_MODEL_PATH",
        "/home/tino/Documents/_Documents/Schule/4 Wilhelm Maybach Schule/2BKI Jahr 2/Abschlussprojekt/test assistant/assistant_all_in_one/vosk-model-de-0.21/",
    )
    model = vosk.Model(VOSK_MODEL_PATH)
    rec = vosk.KaldiRecognizer(model, SAMPLE_RATE)
    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=SAMPLE_RATE,
        input=True,
        frames_per_buffer=4000
    )
    stream.start_stream()
    while True:
        # Overflows are tolerated rather than raised.
        data = stream.read(4000, exception_on_overflow=False)
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = result.get("text", "")
            if text:
                audio_queue.put(text)
 def main():
    """Start the listener thread and print every recognized utterance.

    Runs until the process is interrupted.
    """
    threading.Thread(target=vosk_listener, daemon=True).start()
    while True:
        try:
            text = audio_queue.get(timeout=0.1)
        except queue.Empty:
            # Nothing recognized within the timeout; poll again.
            continue
        # Label is "STT" (speech-to-text); it was misspelled "SST".
        print("[STT]", text)
 if __name__ == "__main__":
    main()