RasPi_Voice_Assistant--WIP/main.py

import threading
import queue
import json
import time
import os
import subprocess
import sounddevice as sd
import re
import asyncio

#test

# =========================
# KONFIGURATION
# =========================

VOSK_MODEL_PATH = "/home/tino/Desktop/Abschlussprojekt/test assistant/cloneAssistantAllInOne/vosk-model-de-0.21/"
PIPER_BIN = "piper"
PIPER_MODEL = "de_DE-thorsten-medium.onnx"
SAMPLE_RATE = 22050

# =========================
# STATES
# =========================

STATE_IDLE = "IDLE"
STATE_LISTENING = "LISTENING"
STATE_SPEAKING = "SPEAKING"

# =========================
# GLOBALER ZUSTAND
# =========================

state = STATE_IDLE
context = {
    "intent": None,
    "slots": {},
    "required_slots": [],
    "pending_slot": None,
    "action": None,
}

audio_queue = queue.Queue()

# =========================
# TTS (PIPER)
# =========================

def speak(text):
    global state
    state = STATE_SPEAKING
    print(f"[TTS] {text}")

    process = subprocess.Popen(
        [PIPER_BIN, "--model", PIPER_MODEL, "--output-raw"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )

    audio = process.communicate(input=text.encode("utf-8"))[0]

    play = subprocess.Popen(
        ["aplay", "-r", str(SAMPLE_RATE), "-f", "S16_LE"],
        stdin=subprocess.PIPE
    )
    play.communicate(audio)

    state = STATE_LISTENING


# =========================
# INTENTS & SLOTS
# =========================

INTENTS = {
    "weather": {
        "keywords": ["wetter", "temperatur", "regen"],
        "required_slots": {
            "location": r"\bin\b\s*(\w+)"
        },
        "subactions": ["info"]
    },
    "timer": {
        "keywords": ["timer"],
        "required_slots": {
            "duration": r"(sekunde|minute|stunde)"
        },
        "subactiions": ["start", "stop", "status"]
    }
}

# evtl mit regex überarbeiten

def detect_intent(text):
    text = text.lower()
    for name, data in INTENTS.items():
        if any(word in text for word in data["keywords"]):
            return name
    return None


# =========================
# SKILLS
# =========================

from weather_jetzt import get_weather_for_location

def weather_skill(slots):
    location = slots["location"]
    result = asyncio.run(get_weather_for_location(location))

    if result:
        return f"Aktuell sind es in {result['location']} {result['temperatur']} Grad und die Wetterlage sieht {result['wetterlage']} aus."
    else:
        return f"Keine Wetterdaten verfügbar"
    #return f"Das Wetter in {location} ist sonnig bei 20 Grad."

def timer_skill(slots):
    duration = slots["duration"]
    return f"Der Timer für {duration} Minuten wurde gestartet."


SKILLS = {
    "weather": weather_skill,
    "timer": timer_skill
}

# =========================
# DIALOGLOGIK
# =========================

def handle_text(text):
    global context, state

    if state != STATE_LISTENING:
        return

    print(f"[STT] {text}")

    # 1. Rückfrage beantworten
    # if context["pending_slot"]:
    #     context["slots"][context["pending_slot"]] = text
    #     context["pending_slot"] = None

    # 2. Intent erkennen
    if not context["intent"]:
        intent = detect_intent(text)
        if not intent:
            speak("Das habe ich nicht verstanden.")
            reset_context()
            return

        context["intent"] = intent
        context["required_slots"] = INTENTS[intent]["required_slots"] # man könnte per liste drüber iterieren wenn man mehrere required slots hat

    if not check_required(text):
        return

    # 3. Fehlende Slots prüfen
    # for slot in context["required_slots"]:
    #     if slot not in context["slots"]:
    #         context["pending_slot"] = slot
    #         ask_for_slot(slot)
    #         return

    # 4. Skill ausführen


    result = SKILLS[context["intent"]](context["slots"])
    speak(result)
    reset_context()

def check_required(text):
    intent_data = INTENTS[context["intent"]]
    text = text.lower()

    for slot, pattern in intent_data.get("required_slots", {}).items():
        if slot not in context["slots"]:
            match = re.search(pattern, text)
            if match:
                context["slots"][slot] = match.group(1) # schau an
            else:
                context["pending_slot"] = slot
                ask_for_slot(slot)
                return False

    for action in intent_data.get("subactions", []):
        if action in text:
            context["slots"]["action"] = action

    context["pending_slot"] = None
    return True


def ask_for_slot(slot):
    questions = {
        "location": "Für welchen Ort?",
        "duration": "Wie lange soll der Timer laufen?"
    }
    speak(questions.get(slot, "Bitte spezifizieren."))


def reset_context():
    global context, state
    context = {
        "intent": None,
        "slots": {},
        "required_slots": [],
        "pending_slot": None
    }
    state = STATE_IDLE


# =========================
# VOSK LISTENER
# =========================

def vosk_listener():
    SAMPLE_RATE_VOSK = 16000
    from vosk import Model, KaldiRecognizer
    import pyaudio

    model = Model(VOSK_MODEL_PATH)
    rec = KaldiRecognizer(model, SAMPLE_RATE_VOSK)

    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=SAMPLE_RATE_VOSK,
        input=True,
        frames_per_buffer=4000
    )
#    stream.start_stream()

    while True:
        if state != STATE_SPEAKING: #hinzugefügt um fehlerhafte eingaben zu stoppen
            data = stream.read(4000, exception_on_overflow=False)
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                text = result.get("text", "")
                if text:
                    audio_queue.put(text)
        else:
            rec.Reset()


# =========================
# WAKEWORD (SIMPLIFIZIERT)
# =========================
"""
def fake_wakeword_detector():
    global state
    while True:
        if state == STATE_IDLE:
            time.sleep(0.1)
            state = STATE_LISTENING
            speak("Wie kann ich helfen?")
 """
# ==========================
# WAKEWORD (PORCUPINE)
# ==========================
def real_wakeword_detector():
    import pvporcupine
    import numpy as np

    global state

    ACCESS_KEY = "lpz+8e9omUnQtCQPeaawZauxVRqdhbcDH3fz19oZsp7zXKflWCiYMw=="
    WAKEWORD = "jarvis"   # built-in wake word

    porcupine = pvporcupine.create(
        access_key=ACCESS_KEY,
        keywords=[WAKEWORD]
    )
    if state == STATE_IDLE:

        def callback(indata, frames, time_info, status):
            pcm = np.frombuffer(indata, dtype=np.int16)
            result = porcupine.process(pcm)
            if result >= 0:
                time.sleep(1)  #verbesserung der spracheingabe: wurde hinzugefügt weil es sonst worte halluziniert (wie "eine", "jarvis")
                state = STATE_LISTENING
                print("WAKE WORD DETECTED")
                speak("Ja, wie kann ich helfen?")


        with sd.InputStream(
            samplerate=porcupine.sample_rate,
            channels=1,
            dtype="int16",
            blocksize=porcupine.frame_length,
            callback=callback,
        ):
            print("Listening...")
            while True:
                pass


# =========================
# MAIN LOOP
# =========================

def main():
    threading.Thread(target=vosk_listener, daemon=True).start()
#    threading.Thread(target=fake_wakeword_detector, daemon=True).start()
    threading.Thread(target=real_wakeword_detector, daemon=True).start()

    while True:
        try:
            text = audio_queue.get(timeout=0.1)
            handle_text(text)
        except queue.Empty:
            pass


if __name__ == "__main__":
    main()