Skip to content

Audio Visualizer


  • Class name: SaltAudioVisualizer
  • Category: SALT/AudioViz/Audio/Util
  • Output node: True

Provides a visual representation of audio data by plotting the clip's normalized waveform against frame number. This can be particularly useful for analyzing the audio's structure, identifying specific parts, or simply for aesthetic visualization purposes.

Input types


  • audio
    • The audio data to be visualized. It is crucial for understanding the audio's structure and for generating its visual representation.
    • Comfy dtype: AUDIO
    • Python dtype: bytes-like WAV data (the node wraps it in io.BytesIO and parses it as "wav")
  • frame_rate
    • Defines the number of frames per second. It is used to convert start_frame and end_frame into millisecond offsets and to scale the plot's horizontal axis into frame numbers.
    • Comfy dtype: INT
    • Python dtype: int


  • start_frame
    • Specifies the starting frame for the visualization, allowing for partial visualization of the audio clip.
    • Comfy dtype: INT
    • Python dtype: int
  • end_frame
    • Determines the ending frame for the visualization, enabling users to focus on a specific segment of the audio clip. The default of -1 means "through the end of the clip".
    • Comfy dtype: INT
    • Python dtype: int

Output types

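The node doesn't have output types. As an output node, it returns a UI entry that references the visualization PNG in the temp directory.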

Usage tips

  • Infra type: CPU
  • Common nodes: unknown

Source code

class SaltAudioVisualizer:
    """ComfyUI output node that plots an audio clip's waveform to a temp PNG.

    The clip is decoded from WAV bytes, optionally trimmed to a frame range,
    down-mixed to mono, peak-normalized to [-1, 1] and plotted against frame
    number. The node has no outputs; it only returns a UI image entry.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Return the node's input specification."""
        return {
            "required": {
                "audio": ("AUDIO", {}),
                "frame_rate": ("INT", {"default": 8, "min": 1, "max": 244}),
            },
            "optional": {
                "start_frame": ("INT", {"min": 0, "default": 0}),
                # -1 is the "through the end of the clip" sentinel used by
                # visualize_audio, so the minimum must admit it.
                "end_frame": ("INT", {"min": -1, "default": -1}),
            },
        }

    RETURN_TYPES = ()
    OUTPUT_NODE = True

    FUNCTION = "visualize_audio"
    CATEGORY = "SALT/AudioViz/Audio/Util"

    @staticmethod
    def _normalized_mono(samples, channels):
        """Down-mix interleaved samples to mono and scale them by their peak.

        Args:
            samples: Interleaved sample values (any array-like of numbers).
            channels: Number of interleaved channels in ``samples``.

        Returns:
            A float64 array with one value per audio frame. Values lie in
            [-1, 1]; silent or empty input is returned unscaled (all zeros
            or empty) instead of dividing by zero.
        """
        # Work in float64 so abs() of the most negative integer sample
        # (e.g. int16 -32768) cannot overflow.
        mono = np.asarray(samples, dtype=np.float64)
        if channels > 1:
            mono = mono.reshape((-1, channels)).mean(axis=1)

        peak = np.abs(mono).max() if mono.size else 0.0
        if peak > 0:
            mono = mono / peak
        return mono

    def visualize_audio(self, audio, frame_rate, start_frame=0, end_frame=-1):
        """Render the waveform of ``audio`` to a PNG in the temp directory.

        Args:
            audio: Bytes-like WAV data (it is wrapped in ``io.BytesIO``).
            frame_rate: Frames per second used to convert frame indices to
                milliseconds and to scale the x axis.
            start_frame: First frame to include.
            end_frame: Frame to stop at; -1 means the end of the clip.

        Returns:
            A dict of the form ``{"ui": {"images": [...]}}`` naming the
            generated file with ``"type": "temp"``.
        """
        temp_dir = folder_paths.get_temp_directory()
        os.makedirs(temp_dir, exist_ok=True)

        audio_segment = AudioSegment.from_file(io.BytesIO(audio), format="wav", dir=temp_dir)

        # Frame indices are converted to milliseconds, the unit in which
        # len() and slicing of the segment are expressed here.
        frame_duration_ms = 1000 / frame_rate
        start_ms = start_frame * frame_duration_ms
        end_ms = end_frame * frame_duration_ms if end_frame != -1 else len(audio_segment)

        relevant_audio_segment = audio_segment[start_ms:end_ms]

        normalized_samples = self._normalized_mono(
            relevant_audio_segment.get_array_of_samples(),
            relevant_audio_segment.channels,
        )

        # audio_segment.frame_rate is the sample rate; dividing it by the
        # visualization frame rate gives samples per visualization frame.
        total_frames = len(normalized_samples) / (audio_segment.frame_rate / frame_rate)
        frame_numbers = np.linspace(
            start_frame,
            start_frame + total_frames,
            num=len(normalized_samples),
            endpoint=False,
        )

        filename = str(uuid.uuid4()) + "_visualization.png"
        file_path = os.path.join(temp_dir, filename)

        fig = plt.figure(figsize=(10, 4))
        try:
            plt.plot(frame_numbers, normalized_samples, linewidth=0.5)
            plt.title("Audio Visualization")
            plt.ylim(-1, 1)
            plt.savefig(file_path)
        finally:
            # Always release the figure so repeated runs do not accumulate
            # open figures.
            plt.close(fig)

        ui_output = {
            "ui": {
                "images": [
                    {
                        "filename": filename,
                        "subfolder": "",
                        "type": "temp",
                    }
                ]
            }
        }

        return ui_output

    @staticmethod
    def gen_hash(input_dict):
        """Return a SHA-256 hex digest of ``input_dict``.

        The dict is serialized as JSON with sorted keys, so key order does
        not affect the result. Values exposing the buffer protocol (such as
        bytes) are replaced by a digest of their content so they can be
        serialized.

        Raises:
            TypeError: If a value is neither JSON-serializable nor
                bytes-like.
        """

        def digest_buffer(value):
            try:
                view = memoryview(value)
            except TypeError:
                raise TypeError(
                    f"Object of type {type(value).__name__} is not JSON serializable"
                ) from None
            return {"sha256": hashlib.sha256(view.tobytes()).hexdigest()}

        sorted_json = json.dumps(input_dict, sort_keys=True, default=digest_buffer)
        hash_obj = hashlib.sha256()
        hash_obj.update(sorted_json.encode("utf-8"))
        return hash_obj.hexdigest()

    @classmethod
    def IS_CHANGED(cls, **kwargs):
        """Return a hash of the node inputs, used as a change fingerprint."""
        return cls.gen_hash(kwargs)