Skip to content

Program records speech without needing the wakeword after the first loop #296

@Shahd-R

Description

@Shahd-R

Built a program using a local LLM that requires the wake word i trained to listen to user questions, which then feeds the input into the LLM and the output is fed into the elevenlabs API for the speech to text.
My issue is that when the program first runs it works perfectly fine, waits for the wakeword to start listening, after that it basically ignores the need for a wakeword and just records everything it hears.

Text Processing

def process_text(text):
    global chat_history
    global detected

    if not text.strip():
        print("No speech detected. Skipping...")
        return

    text = fix_acronyms(text)
    print(f"Recorded text: {text}")
    #print("Hellooooo")

     #Add to history
    chat_history += f"\nUser: {text}\n"
    chat_history = trim_chat_history_to_token_limit(chat_history, prompt_template)
    #print(chat_history.splitlines()[-1])
    chat_recent = chat_history.splitlines()[-1]
   

    # Main model response
    prompt = f"{prompt_template}\n{chat_recent}Assistant:"
    response = llm(prompt, stop=["User:", "</s>", "[INST]", "[/INST]","Assistant:"], max_tokens=200)
    assistant_reply = response["choices"][0]["text"].strip()

    locc = ""
    for char in assistant_reply:
        if char == ":":
            break
        else:
            locc +=char

    low_conf= "Unfortunately,"
    words = assistant_reply.split()
    #print(words)

        ####################### WEB SEARCH CAPABILITY ######################
        # Can be used to use DuckDuckGo for looking up the question when the llms confidence is low
        # Outputs the first web result it finds
        #If removed, the AI will instead apologise and state that they are unsure of the answer instead.
    
        
    elif low_conf in words:
        print(" Low confidence detected ; using web search...")
        global search_query
        global yay
        search_query= locc
        results = search_client.text(
            keywords=search_query,
            region="wt_wt",
            safesearch="off",
            max_results=1
        )
        print(results)
        output= results[0]["body"]
        
        print(f"\nAssistant says:" + results[0]["body"])
       # play_response(output)
       # start_recorder()
    
    #######################################################################################
             
    else:   
        print(f"\nAssistant says: {assistant_reply}")
      #  play_response(assistant_reply)
       # start_recorder()

STT function

def start_recorder():
    global detected
    detected = False

    def on_wakeword_detected():
        print("\nWake word detected!")
        global detected
        detected = True
    
    def on_recording_start():
        time.sleep(0.5)

        print("Recording...")

    def on_wakeword_detection_start():
        print('\nSay "Hello Wolfy" to begin.')

    def on_wakeword_timeout():
        global detected
        if not detected:
            print(f"Timeout. Say 'Hello Wolfy' to begin.")

    with AudioToTextRecorder(
        wake_words="Hello_wolfy!", # MUST match your ONNX label
        openwakeword_model_paths=onnx_path,
        openwakeword_inference_framework="onnx",
        wakeword_backend="oww",
     
        on_wakeword_detected=on_wakeword_detected,
        on_recording_start=on_recording_start,
        on_wakeword_timeout=on_wakeword_timeout,
        on_wakeword_detection_start=on_wakeword_detection_start,
        post_speech_silence_duration=1,
        min_length_of_recording=2.0,
        wake_word_activation_delay=0.5,
    ) as recorder:

        while (True):
            text = recorder.text()
            if text:
                process_text(text)
                time.sleep(1)

I can recognise that my issue is that the function start_recorder() is only called once and so a wakeword is only needed once, but i have no clue how to seperate the need for a wakeword everytime from just straight up having to rebuild the recorder everytime the audio stream from the speech to text finishes.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions