import React from 'react'
import BlogList from './BlogList/BlogList'
import "./Blog.css"
import approach from "./images/approach.webp"
import webgraph1 from "./images/webgraph1.webp"
import webgraph2 from "./images/webgraph2.webp"
import webgraph3 from "./images/webgraph3.webp"
import webgraph4 from "./images/webgraph4.webp"
import llamabitnet1 from "./images/llamabitnet1.webp"
import llamabitnet2 from "./images/llamabitnet2.webp"
import llamabitnet3 from "./images/llamabitnet3.webp"
import llamabitnet4 from "./images/llamabitnet4.webp"
import llamabitnet5 from "./images/llamabitnet5.webp"
import llamabitnet6 from "./images/llamabitnet6.webp"
import llamabitnet7 from "./images/llamabitnet7.webp"
import orpo1 from "./images/orpo1.webp"
import orpo2 from "./images/orpo2.webp"
import orpo3 from "./images/orpo3.webp"
import gemma1 from "./images/gemma1.webp"
import gemma2 from "./images/gemma2.webp"
import gemma3 from "./images/gemma3.webp"
import gemma4 from "./images/gemma4.webp"
import gemma5 from "./images/gemma5.webp"
import slm1 from "./images/slm1.webp"
import slm2 from "./images/slm2.webp"
import slm3 from "./images/slm3.webp"
import slm4 from "./images/slm4.webp"
import slm5 from "./images/slm5.webp"
import slm6 from "./images/slm6.webp"
import slm7 from "./images/slm7.webp"
import slm8 from "./images/slm8.webp"
import andr1 from "./images/andrej01.webp"
import andr2 from "./images/andrej02.webp"
import andr3 from "./images/andrej03.webp"
import andr4 from "./images/andrej04.webp"
import andr5 from "./images/andrej05.webp.png"
import andr6 from "./images/andrej06.webp"
import andr7 from "./images/andrej07.webp"

const posts_ = [
    {
        id: "queryloop0001",
        title: "Schedule-Free Learning — A New Way to Train Models",
        author: "Queryloop",
        date: "04/19/2024",
        excerpt: "Training 3 Llama models for comparison of Cosine Scheduled and Schedule-Free optimizer.",
        content: "In the realm of machine learning, we are continuously relying on the intricate algorithms and techniques to train our models effectively.",
        parts: [
            { "heading": "Scheduled Learning", "description": "In the realm of machine learning, we are continuously relying on the intricate algorithms and techniques to train our models effectively. Among them, the learning rate stands out as a pivotal factor influencing the model's convergence and performance. Traditionally, we have turned to learning rate schedulers as our trusted allies to reach the goal for optimization. There are several types of learning rate schedulers, including step decay, exponential decay, and cosine annealing. Surely, you would have come across them in the past. To give you a little idea about these learning rate (lr) schedulers; they simply help speed up training and improve model generalization. By dynamically adjusting the learning rate, they can help the model escape local minima and find a better global minimum." },
            { "heading": "Scheduled-Free Learning", "description": "However, it's time to explore a different path — a path that frees us from the confines of learning rate schedulers i.e. Scheduled-Free Learning. With schedule-free optimizers, training is faster as there is no need to specify the stopping time/steps in advance." },
            { "space": "Recently, facebookresearch open-sourced their schedule-free optimizers. There are 2 primary implementations currently:", "list": ["SGDScheduleFree", "AdamWScheduleFree"] },

            { "heading": "Approach", "description": "Schedule-Free learning replaces the momentum of an underlying optimizer with a combination of interpolation and averaging. In the case of gradient descent, the Schedule-free update is:", img: approach },
            { "heading": "Update Step", "description": "Here x is the sequence that evaluations of test/val loss should occur at, which differs from the primary iterates z and the gradient evaluation locations y. The updates to z correspond to the underlying optimizer, which in this case is a simple gradient step." },
            { "heading": "How Scheduled-Free Learning is better?", "description": "As the name suggests, Schedule-free learning does not require a decreasing learning rate schedule, yet typically out-performs, or at worst matches, SOTA schedules such as cosine-decay and linear decay. Only two sequences need to be stored at a time (the third can be computed from the other two on the fly) so this method has the same memory requirements as the base optimizer (parameter buffer + momentum)." },
            { "heading": "Constraints", "description": "Some constraints that may be beneficial when using schedule-free learning to produce better results.", "list": ["There is no need to use a learning rate scheduler, however the code is compatible with one", "Using learning rate warmup is recommended.", "This method does require tuning — it won't necessarily out-perform a schedule approach without also tuning regularization and learning rate parameters.", "For SGD, a learning rate 10x-50x larger than classical rates seems to be a good starting point.", "For AdamW, learning rates in the range 1x-10x larger than with schedule based approaches seem to work.", "Training is more sensitive to the choice of betas than you may expect from standard momentum. The default of 0.9 works on most problems but it may be necessary to increase the value to 0.95 or 0.98 particularly for very long training runs."] },
            { "heading": "Experiments", "description": "I have performed 3 experiments to verify the claims of scheduled-free learning. 2 experiments were performed with the schedule-free optimizer but different learning rates were used. In the third experiment, I have compared the performance of scheduled-free training with a scheduled one. You can track the performance through the provided wandb graphs using the following names", "list": ["AdamW Schedule Free", "AdamW Schedule Free-2", "AdamW (Scheduled-cosine)"], "desc": "For the experiments, I leveraged 2x RTX 5000 GPUs, offering substantial computational muscle. Alongside, the LLama-60m model was used as a unifying thread across all our trials." },
            { "heading": "AdamW Schedule Free", "description": "In this experiment, I have used a Schedule-free AdamW optimizer with a learning rate of 6.0e-4, weight_decay of 0.1, Betas were set to 0.9 and 0.95.", "space": "Analyzing the graphs, it gave a throughput of 127060 tok/s, highest among the 3 experiments. Following losses were observed in this experiment", "list": ["CrossEntropyLoss: 0.3302", "Perplexity: 1.695", "Aux_loss: 0.2605"] },
            { "heading": "AdamW Schedule Free-2", "description": "In this experiment, I have used Schedule-free AdamW optimizer with a learning rate of 5.0e-3 (10x higher than AdamW scheduled), weight_decay of 0.1, Betas were set to 0.9 and 0.95.", "space": "Analyzing the graphs, it gave a throughput of 118487 tok/s, lowest among the 3 experiments. Following losses were observed in this experiment", "list": ["CrossEntropyLoss: 0.1329", "Perplexity: 1.728", "Aux_loss: 0.2603"] },
            { "heading": "AdamW (Scheduled-cosine)", "description": "In this experiment, I have used the AdamW optimizer with a cosine learning rate scheduler and a learning rate of 5.0e-4, weight_decay of 0.1, Betas were set to 0.9 and 0.95. Warm_up steps was set to 2000, alpha_f was set to 0.1.", "space": "Analyzing the graphs, it gave a throughput of 120612 tok/s. Following losses were observed in this experiment", "list": ["CrossEntropyLoss: 0.6391", "Perplexity: 1.706", "Aux_loss: 0.2511"] },
            { "heading": "Wandb Graphs", "description": "You can analyze the throughput and losses through the following graphs", "images": [webgraph1, webgraph2, webgraph3, webgraph4] },
            { "heading": "Conclusion", "description": "Through experiments, I have highlighted the benefits of scheduler-free learning, including its simplicity, flexibility, and ability to enhance model performance. Due to limited compute, I have only experimented with a 60M model. In terms of speed, I have observed the fastest speed (tps) in a scheduler-free optimizer with lr equal to 6.0e-4. Observing the evaluation across downstream tasks and loss curves also suggests that schedule-free optimizer and using constant lr schedule has improved convergence." },
            { "tags": ["AI", "LLM", "Machine Learning", "NLP", "Queryloop"] }
        ]
    },
    {
        id: "queryloop0002",
        title: "Llama-Bitnet | Training a 1.58 bit LLM",
        author: "Queryloop",
        date: "04/04/2024",
        excerpt: "What is 1 bit LLM and How to train 70M Llama-Bitnet?",
        content: "Vanilla LLMs built upon the Transformer architecture typically operate in 16-bit precision (FP-16 or BF-16) and hence the major computation costs account for the floating point matrix addition and multiplication operations...",
        parts: [
            { "heading": "Introduction", "description": "Vanilla LLMs built upon the Transformer architecture typically operate in 16-bit precision (FP-16 or BF-16) and hence the major computation costs account for the floating point matrix addition and multiplication operations. Furthermore, within full-precision LLMs, loading weights from DRAM to an on-chip accelerator memory (e.g. SRAM) incurs higher costs during inference.", "space": "A popular suboptimal solution is post-training quantization which can reduce the precision down to 4 bits for better inference. Also enlarging SRAM to improve throughput imposes higher costs than DRAM." },
            { "heading": "BitNet b1.58", "description": "A significant variant of low-bit LLMs is BitNet b1.58 where all weight values are ternary, taking on values {-1, 0, 1}. Its quantization function is absmean in which the weights are first scaled by their average absolute value and then rounded to the nearest integer ∈ {-1,0,1}. It is an efficient extension of 1-bit BitNet by including 0 in model parameters. BitNet b1.58 is based upon BitNet architecture (replaces nn.Linear with BitLinear). It is highly optimized as it removes floating point multiplication overhead, involving only integer addition (INT-8), and efficiently loads parameters from DRAM. BitNet b1.58 continues to match full-precision Transformer LLM baselines in both perplexity and end-task performance, all while demonstrating cost-effectiveness in terms of latency, memory, throughput, and energy consumption.", img: llamabitnet1, "space": "BitNet b1.58 uses RMSNorm, SwiGLU, and rotary embedding, removes all biases, and hence can be easily integrated into HuggingFace, vLLM, and llama.cpp." },
            { "heading": "Can b1.58 LLMs replace Float 16 Models?", "description": "The authors of BitNet b1.58 compared it with a reproduced FP16-LLaMA by pretraining both models with the same configurations and evaluated the zero-shot performance on various language tasks. The results reveal that BitNet b1.58 starts to match LLaMA at 3B model size and continues to narrow the performance gap onwards, outperforming full-precision models on perplexity and end-task results. Particularly, a 3.9B BitNet b1.58 was 2.4 times faster and consumed 3.32 times less memory than LLaMA 3B, thus reducing memory and latency costs. This demonstrates that BitNet b1.58 is capable of competing with the full-precision LLMs.", img: llamabitnet2 },
            { "space": "Further experiments revealed that BitNet b1.58 70B was 4.1 times faster and capable of 8.9 times higher throughput than the corresponding FP16 LLaMa." },
            { "heading": "1.58 LLM Experiment Details", "description": "Nous Research trained a 1B Bitnet, OLMo-Bitnet-1B on the first 60B tokens of the Dolma dataset. They also trained a standard FP16 OLMo-1B model with the same training configurations to compare performance. The wandb report reveals:", "space": "OLMo-1B reported slightly better perplexity and cross-entropy loss than OLMo-Bitnet-1B on all Dolma dataset subsets including small_dolma_stack, small_pile, small_dolma_crawl, small_c4_en, small-m2d2_s2orc, small-wikitext_103, small-dolma_reddit, small-dolma_books, small_ice, small-dolma_pes2o.", img: llamabitnet3 },
            { "space": "Similarly, OLMo-1B scores on end-tasks were moderately higher than OLMo-Bitnet-1B.", img: llamabitnet4 },
            { "space": "GPU memory consumption of both LLMs was also almost identical.", img: llamabitnet5 },
            { "heading": "Training 70M LLama Bitnet", "description": "The model was trained for 2 epochs using configurations of NousResearch/Llama-2-7b-hf using dataset abideen/Cosmopedia-100k-pretrain on 1xA100 for almost 2 hours. The training parameters used are below:", "list": ["Learning Rate: 1.5e-3", "Warmup Steps: 0.1", "Number of Training Epochs: 2", "Per Device Training Batch Size: 20", "dimension: 768", "logging steps: 100", "weight decay: 0.01", "lr_scheduler type: cosine", "save steps: 0.25", "fp16: True", "context length: 256", "Gradient Accumulation Steps: 2", "Number of Processes: 1"], "space": "The training process has also been logged to Weights and Biases. Some of the graphs are shown below:", img: llamabitnet6 },
            { "heading": "Small snippet of training code is given below:" },
            {
                "code": `### Create the llama model with custom config. Convert it to bitnet.
            model = LlamaForCausalLM(config)
            convert_to_bitnet(model, copy_weights=False)
            model_size = sum(t.numel() for t in model.parameters())
            print(f"Model size: {model_size/1000**2:.1f}M parameters")
            tokenizer.pad_token = tokenizer.eos_token
            data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
            
            output_path = "./out"
            args = TrainingArguments(
                output_dir=output_path,
                per_device_train_batch_size=BATCH_SIZE,
                logging_steps=100,
                gradient_accumulation_steps=2,
                num_train_epochs=EPOCHS,
                weight_decay=0.01,
                warmup_steps=0.1,
                lr_scheduler_type="cosine",
                learning_rate=LEARNING_RATE,
                save_steps=0.25,
                fp16=True,
                report_to="wandb"
            )
            
            trainer = Trainer(
                model=model,
                tokenizer=tokenizer,
                args=args,
                data_collator=data_collator,
                train_dataset=tokenized_data["train"],
            )
            
            trainer.train()
            trainer.save_model(f"{output_path}/final_model")
            folder = "./out/final_model"
            api = HfApi()
            create_repo(
                repo_id = f"{HUGGINGFACE_ID}/{NEW_MODEL}",
                repo_type="model",
                exist_ok=True,
                token=HF_TOKEN,
            )
            
            # Upload Model files
            api.upload_folder(
                folder_path=folder,
                repo_type="model",
                repo_id=f"{HUGGINGFACE_ID}/{NEW_MODEL}",
                token=HF_TOKEN,
            )`},
            { "heading": "AutoBitnet", "description": "AutoBitnet is an automated tool that allows you to train a BitNet b1.58 on the baselines of any LLaMA architecture on a colab T4 GPU." },
            { "space": "", img: llamabitnet7 },
            { "tags": ["AI", "NLP", "Computer Vision", "Machine Learning", "Deep Learning", "Queryloop"] },
        ]
    },
    {
        id: "queryloop0003",
        title: "ORPO Outperforms SFT+DPO | Train Phi-2 with ORPO",
        author: "Queryloop",
        date: "03/22/2024",
        excerpt: "Train Phi-2 with ORPO with LazyOrpo",
        content: "Before jumping into ORPO, I am going to assume that you are well-acquainted with the process of fine-tuning LLMs for optimal performance. One of the most common technique used for fine-tuning is the Supervised Fine-Tuning (SFT)...",
        parts: [
            { "heading": "Introduction", "description": "Before jumping into ORPO, I am going to assume that you are well-acquainted with the process of fine-tuning LLMs for optimal performance. One of the most common techniques used for fine-tuning is the Supervised Fine-Tuning (SFT). The most common way for doing SFT is to load the model in 4-bit and apply the config to the model for Lora training. Then we use TRL’s SFTTrainer to fine-tune models. That’s one way of reaching an optimal LLM. Another technique that has been here for some time now is the DPO (Direct Preference Optimization). For DPO, the dataset should be in a specific format i.e. it should contain a chosen response and a rejected response along with the instruction. DPO has shown great results in aligning the model while requiring less compute for the training process. To further improve the model’s performance, recently people have adopted SFT followed by DPO on the same model. This combination of SFT+DPO has proved to be quite effective but at the same time requires more compute resources.", "space": "What if I tell you, there is another better fine-tuning technique that can replace both SFT+DPO and has shown promising results. I am referring to ORPO (Odds Ratio Preference Optimization). The main highlight is its loss function. It incorporates an odds ratio-based penalty to the conventional negative log-likelihood (NLL) loss for differentiating the generation styles between favored and disfavored responses." },
            { "heading": "Can ORPO redefine how we train and align LLMs for RLHF?", "description": "State-of-the-art LLMs followed the process of Base Model → Supervised Fine-tuning → RLHF (PPO/DPO). This is very resource-intensive and complex. Odds Ratio Preference Optimization (ORPO) proposes a new method to train LLMs by combining SFT and Alignment into a new objective (loss function), achieving state of the art results. ORPO not only reduces the cost of the training but also outperforms the results from first fine-tuning the model and then doing RLHF (DPO) on the fine-tuned version. ORPO does not require a reference model, unlike RLHF and DPO. In that sense, ORPO is computationally more efficient than RLHF and DPO in two perspectives:", "list": ["Memory allocation", "Fewer FLOPs per batch."] },
            { "space": "So, in my opinion the answer to the above question is most probably a “Yes”. It can certainly influence the way how we train our models in the future or may have an impact on future research work regarding fine-tuning LLMs." },
            { "heading": "ORPO details", "description": "🏆 ORPO Outperforms SFT, SFT+DPO on PHI-2, Llama 2, and Mistral 📊 Mistral ORPO achieves 12.20% on AlpacaEval2.0, 66.19% on IFEval, and 7.32 on MT-Bench Zephyr Beta", "space": "Results from the ORPO paper are impressive and to verify the results of this paper, I decided to try it out on Phi-2 with Argilla’s dpo-mix-7k dataset. Some results from the paper are shown below.", img: orpo1 },
            { "space": "The reason for choosing Phi-2 is that it shows an insane amount of improvement with this technique as compared to SFT+DPO." },
            { "heading": "Training process", "list": ["For implementing ORPO, we will require a dataset that is in DPO format i.e. it should have chosen and rejected responses. For this experiment, we will opt for Argilla’s dpo-mix-7k preference dataset.", " Make sure the dataset doesn’t contain instances where the chosen and rejected responses are the same, or one is empty.", "Select a pre-trained LLM (e.g., Llama-2, Mistral). In this case, I have selected Phi-2 as the base model.", "Train the Base model with ORPO objective on preference dataset"] },
            { "space": "There is no extra SFT step that is directly applied to base model. The model was trained for 1 epoch on 1x A40 for almost 6 hours. The training parameters used are below:" },
            { "list": ["Learning Rate: 5e-6", "Warmup Steps: 100", "Model Name: microsoft/phi-2", "Data Name: argilla/dpo-mix-7k", "Number of Training Epochs: 1", "Maximum Length of Prompt: 128", "Maximum Length of Response: 2048", "Per Device Training Batch Size: 4", "Per Device Evaluation Batch Size: 4", "Gradient Accumulation Steps: 1", "Number of Processes: 1"] },
            { "space": "The training process has also been logged to Weights and Biases. Some of the graphs are shown below:", img: orpo2 },
            { "heading": "LazyORPO", "description": "LazyORPO (Automated tool to train your model with ORPO). ORPO is a new technique that replaces SFT+DPO. I gave ORPO a shot with Phi-2 and Argilla dpo-mix-7k yielding Phi2-pro.", "space": "Odds Ratio Preference Optimization (ORPO) proposes a new method to train LLMs by combining SFT and Alignment into a new objective (loss function), achieving state-of-the-art results. Since ORPO is not yet included in HF’s TRL, I have made a notebook that automates the entire training process with ORPO in order to make the training phase much easier. Just mention the base model, dataset, epochs, and learning rate to shoot the training. One thing to notice is that ORPO requires more VRAM, as I was not able to fit an 8B Gemma model on A40 48GB VRAM. So, do your calculations accordingly." },
            { "space": "A colab notebook is available for you to try it out. You can access the GPUs using RunPod." },
            { img: orpo3 },
            { "heading": "LazyORPO:", "description": "https://colab.research.google.com/drive/19ci5XIcJDxDVPY2xC1ftZ5z1kc2ah_rx?usp=sharing" },
            { "tags": ["AI", "LLM", "Machine Learning", "NLP", "Deep Learning", "Queryloop"] }
        ]
    },
    {
        id: "queryloop0004",
        title: "Multi-GPU Training of 70B LLM with Deepspeed and FSDP+Qlora",
        author: "Queryloop",
        date: "03/14/2024",
        excerpt: "Train 70–120B LLM on 4xA100s and 2xRTX3090s (Consumer-grade GPUs)",
        content: "I have been working with bigger models like Mixtral 8x7B, Qwen-120B, and Miqu-70B recently. But the most important thing when playing with bigger models is the amount of compute resources they require during training..."
        , parts: [
            { "space": "I have been working with bigger models like Mixtral 8x7B, Qwen-120B, and Miqu-70B recently. But the most important thing when playing with bigger models is the amount of compute resources they require during training. I have been using Deepspeed for multi-GPU training, understanding what difference each stage(Zero-1, Zero-2, Zero-3) brings to the table. I will also be focusing on a recent technique (FSDP+Qlora) for training larger models on consumer-grade GPUs. A few details regarding my recent experiments:" },
            { "heading": "Liberated Miqu 70B", "description": "With the release of the new dataset from Abacus AI, I tried out fine-tuning Miqu-70B on SystemChat with 2x A100s and Deepspeed Zero-2. I also tried out Deepspeed Zero-3 but with multiple issues occurring in Axolotl regarding quantization and OOM, I went back to Zero-2. Some highlights of Zero-2 are that it only divides optimizer states and gradients across GPUs but the model params are copied on each GPU while in Zero-3, model weights are also distributed across all GPUs. Liberated Miqu 70B is a totally uncensored model. So be careful with what you use it for. I trained the model for 1 epoch using Qlora with axolotl. The axolotl configuration for this experiment is shown below." },
            {
                "code": `base_model: 152334H/miqu-1-70b-sf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: abacusai/SystemChat
    type: sharegpt
dataset_prepared_path:
val_set_size: 0
output_dir: /workspace/miqu-systemchat
resume_from_checkpoint:
hf_use_auth_token:
adapter:  qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save:
- embed_tokens
- lm_head
wandb_project: Miqu-Systemchat-multiGPU
wandb_entity: 
wandb_watch:
wandb_run_id: 
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs:
group_by_length: false
bf16: true
fp16: false
tf32: false
gradient_checkpointing: true
early_stopping_patience:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
eval_steps: 
save_steps: 2000
save_total_limit: 2
eval_sample_packing:
debug:
deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.05
fsdp:
fsdp_config:
special_tokens:
tokens:
trust_remote_code: true
            `},
            { "space": "Liberated-Miqu-70B: https://huggingface.co/abideen/Liberated-Miqu-70B" },
            { "heading": "FSDP+Qlora", "description": "Answer.ai released a new technique to train bigger models on consumer-grade GPUs (RTX 3090 or 4090) with FSDP and Qlora. Two types of hardware are normally used, one is the data center class hardware, such as H100s and A100s, and others are desktop computers containing gaming GPUs, such as dual 4090s and 3090s. The idea here was simple; figure out how to use these 10x cheaper GPUs to train the best available open-source models. Here is where Answer.ai’s fsdp+Qlora comes in handy. I gave FSDP+Qlora a shot with Mixtral 8x7B on 2x 3090s. This technique was also integrated into the Axolotl library on an experimental basis. In Answer.ai’s blog, they did not mention anything regarding speed and time constraints with consumer-grade GPUs. I set out training Mixtral on only 100 steps to try things out but the time required for that was 70 hrs which is huge. Since the experiment was taking such a long time, it was not feasible for me to complete this experiment. So currently, I am moving back to using A100s until this technique becomes more efficient. Btw, a great effort by Jeremy Howard and his team to bring the training of larger models to consumer-grade GPUs with limited VRAM. The axolotl config file for this experiment is given below." },
            {
                "code": `base_model: mistralai/Mixtral-8x7B-v0.1
            model_type: AutoModelForCausalLM
            tokenizer_type: LlamaTokenizer
            trust_remote_code: true
            
            load_in_8bit: false
            load_in_4bit: true
            strict: false
            datasets:
              - path: cognitivecomputations/WizardLM_evol_instruct_V2_196k_unfiltered_merged_split
                type: sharegpt
                conversation: chatml
            dataset_prepared_path: last_run_prepared
            val_set_size: 0.02
            output_dir: ./qlora-out
            model_config:
              output_router_logits: true
            adapter: qlora
            lora_model_dir:
            sequence_len: 1024
            sample_packing: false
            pad_to_sequence_len: false
            lora_r: 16
            lora_alpha: 16
            lora_dropout: 0.05
            lora_target_linear: true
            lora_fan_in_fan_out:
            wandb_project: fsdp
            wandb_entity:
            wandb_watch:
            wandb_name:
            wandb_log_model:
            gradient_accumulation_steps: 4
            micro_batch_size: 2
            num_epochs: 1
            max_steps: 100
            optimizer: paged_adamw_8bit
            lr_scheduler: cosine
            learning_rate: 0.0002
            train_on_inputs: false
            group_by_length: false
            bf16: auto
            fp16:
            tf32: false
            gradient_checkpointing: true
            early_stopping_patience:
            resume_from_checkpoint:
            local_rank:
            logging_steps: 1
            xformers_attention:
            flash_attention: true
            loss_watchdog_threshold: 5.0
            loss_watchdog_patience: 3
            warmup_steps: 10
            evals_per_epoch: 4
            eval_table_size:
            eval_max_new_tokens: 128
            saves_per_epoch: 1
            debug:
            weight_decay: 0.0
            fsdp:
              - full_shard
            fsdp_config:
              fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
            special_tokens:`},
            { "heading": "MegaQwen-120B", "description": "I also tried out the interleaving technique on Qwen-70B to create MegaQwen-120B inspired by Venus-120B. Since a 120B model would have also required an insane amount of VRAM for training, I learned this fact the hard way that you have to fine-tune your 70B model before and then interleave it, thereby bypassing the memory constraints. I tried out interleaving first and then fine-tuning the massive 120B model which ended up with OOM. My prior logic was that a model 120B param requires 240GB VRAM (4bit -> 68GB), I threw 4x A100 i.e. 320GB VRAM and this should work. But, that didn’t work out. The main reason was that Zero-2 has copies of entire model parameters on each GPU and Pytorch was somehow taking 12GB leading to OOM on 80GB VRAM A100, so throwing in A100s didn’t make any difference. Also, Zero-3 (model params sharding) was not an option due to the errors that it was presenting. I noted these OOM errors and will try to keep track of the memory constraints more vigilantly in the future. The axolotl config for this experiment is available below." },
            {
                "code": `base_model: abideen/Qwen-120B
            model_type: Qwen2ForCausalLM
            tokenizer_type: Qwen2Tokenizer
            load_in_8bit: false
            load_in_4bit: true
            strict: false
            
            datasets:
              - path: abacusai/SystemChat
                type: sharegpt
            dataset_prepared_path:
            val_set_size: 0
            output_dir: /workspace/Qwen-120b-systemchat
            resume_from_checkpoint:
            hf_use_auth_token:
            adapter:  qlora
            lora_model_dir:
            sequence_len: 2048
            sample_packing: true
            pad_to_sequence_len: true
            lora_r: 16
            lora_alpha: 16
            lora_dropout: 0.05
            lora_target_modules:
            lora_target_linear: true
            lora_fan_in_fan_out:
            lora_modules_to_save:
              - embed_tokens
              - lm_head
            wandb_project: Qwen-Systemchat-multiGPU
            wandb_entity: 
            wandb_watch:
            wandb_run_id: 
            wandb_log_model:
            gradient_accumulation_steps: 1
            micro_batch_size: 1
            num_epochs: 1
            optimizer: paged_adamw_8bit
            lr_scheduler: cosine
            learning_rate: 0.0002
            train_on_inputs:
            group_by_length: false
            bf16: true
            fp16: false
            tf32: false
            gradient_checkpointing: true
            early_stopping_patience:
            local_rank:
            logging_steps: 1
            xformers_attention:
            flash_attention: true
            warmup_steps: 100
            eval_steps: 
            save_steps: 2000
            save_total_limit: 2
            eval_sample_packing:
            debug:
            deepspeed:
            weight_decay: 0.05
            fsdp:
            fsdp_config:
            special_tokens:
              eos_token: "<|im_end|>"
            tokens:
              - "<|im_start|>"
            trust_remote_code: true`},
            { "space": "MegaQwen-120B: https://huggingface.co/abideen/MegaQwen-120B" },
            { "heading": "Conclusion", "description": "All in all, it was a good experience for me to try out multi-GPU training on the best available open-source models. I tried to work my way through different errors, but some remained unresolved. Will try to solve them in future experiments." },
            { "tags": ["AI", "LLM", "NLP", "Machine Learning", "Deep Learning", "Queryloop"] },
        ]
    },
    {
        id: "queryloop0005",
        title: "Everything you need to know about Google’s new Gemma 7B and 2B Models",
        author: "Queryloop",
        date: "02/29/2024",
        excerpt: "Also releasing Gemma-7B-Openhermes and Gemma-2B-Openhermes",
        content: "Google has been in the LLM space for quite some time now, yet Gemma remains their first open LLM. The release of Gemma has stirred the community and everyone is excited to try it out. Like everyone, I am no exception...",
        parts: [
            { "heading": "Introduction", "description": "Google has been in the LLM space for quite some time now, yet Gemma remains their first open LLM. The release of Gemma has stirred the community and everyone is excited to try it out. Like everyone, I am no exception. But how good is this model really? To answer this question, I have compared the performance of different variants of the Gemma family and stated their results, and I have also released 2 more variants of Gemma-it. Before moving on, let’s introduce Gemma first.", "space": "The Gemma family offers a collection of lightweight open models from Google that are created from the same research and technology as that used in the development of the Gemini model. They are text-to-text, decoder-only language models, available in English. The main highlight of this family of models is that their weights are open, and they also offer pre-trained and instruction-tuned variants of the model. Gemma models can be used in different natural language processing problems such as question answering, summarization, and reasoning. Due to their relatively small sizes, using them in environments with restricted resources such as laptops, desktops, or one’s own cloud infrastructure no longer looks unrealistic, which supports innovation for everybody. The four variants released are:" },
            { "list": ["Gemma-2B", "Gemma-2B-it", "Gemma-7B", "Gemma-7B-it"] },
            { "space": "The difference between “it” aka “Instruction Tuned” and the base model is that the “it” variants are better for chat purposes since they have been fine-tuned to better understand the instructions and generate better answers while the base variants are those that have not undergone any sort of fine-tuning. They can still generate answers, but not as good as the “it” ones." },
            { "heading": "Performance", "description": "Now, coming towards the performance side. The Gemma performs well on the Open LLM leaderboard. But if we compare Gemma-2b (2.51 B) with PHI-2 (2.7 B) on the same benchmarks, PHI-2 easily beats Gemma-2b." },
            { img: gemma1 },
            { img: gemma2 },
            { "space": "The results of PHI-2 are almost comparable to Gemma-7B. The numbers are even worse on the Nous and EQ benchmarks. Gemma-2b(-it) (2.51B param) severely underperforms phi-2 (2.78B param) on Nous’ benchmark suite. Quite surprising that both AGIEval and Bigbench are particularly related with human evaluation.", img: gemma3 },
            { "space": "To view and analyze the results across various benchmarks, visit the model card for Google’s Gemma. It is tempting, to say the least, to suspect that Gemma might have been overfitting the test benchmarks." },
            { "heading": "Release of Gemma-7B-Openhermes and Gemma-2B-Openhermes", "description": "Gemma-7b-openhermes is a variant of the Gemma 7B language model, which has been further fine-tuned on the OpenHermes-2.5 preference dataset using QLoRA.", "list": ["google/gemma-7b-it", "mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha"] },
            { "space": "Similarly Gemma-2b-openhermes has been finetuned.", "list": ["google/gemma-2b-it", "mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha"] },
            { "space": "Since Gemma “base” and “it” models did not show satisfactory performance, I tried to steer the model, releasing the DPO’d variant of the models on the OpenHermes-2.5 preference dataset. Both Gemma-2b and Gemma-7b are available to try out in openhermes variant. As compared to the “it” variant of the model, the model improved a bit but some of the results were still lower than the original “it” variants. To give you a little context, on the AGIEVAL Gemma-2B-Openhermes showed an improvement from 23.76 to 23.80, and 29.41 to 44.75 on BIGBENCH. But on the GPT4ALL and TRUTHFULQA, the model severely underperforms." },
            { "img": gemma4 },
            { "img": gemma5 },
            { "heading": "What is wrong with Gemma?", "description": "Gemma's hype-train did not last for long due to excessive RLHF. Specifically, the instruction-tuned version did not show great results. The alignment/RLHF process involves adjusting the model’s outputs so that they align with certain ethical guidelines and values, which can sometimes result in overly cautious or conservative responses. While this approach is important for ensuring responsible AI practices, it appears that in the case of the instruction-tuned variants of the Gemma family, the alignment may have been too restrictive, leading to models that lacked the necessary flexibility to provide informative and engaging answers. There were a lot of X’s to fill in its response like “I can’t answer X”. The main reason behind this behavior was the excessive alignment that this model (it) has undergone. These restrictions were reflected in the DPO’d variant as well. The final verdict, in my opinion, is that the instruction-tuned variants are stripped of the basic facts and are censored more than what is required, not leaving an impact on the community at the moment." },
            { "heading": "Next step?", "description": "So, the future course of action will be to fine-tune the base models of the Gemma family and check to see if the model has shown any sort of improvement over the instruction-tuned variant. The truth is if we want to get something useful out of Gemma we need to start with the foundational model and train it with a lot of basic information about reality.", "space": "In the days to come, Google might realize that excessive RLHF is not useful as it is driving away the enthusiasts and sooner or later, they need to tackle the issue of denying the users their response." },
            { "heading": "Conclusion", "description": "Overall, while the idea of instruction-tuned models holds potential, the current implementations appear to have fallen short of expectations, particularly when it comes to providing SOTA LM to the wider community. Further research and development will likely be needed to address these challenges and create more effective and engaging instruction-tuned models in the future." },
            { "tags": ["AI", "Google", "Gemma Model", "Machine Learning", "Deep learning", "Queryloop"] },
        ]
    },
    {
        id: "queryloop0006",
        title: "Best SLM? Stable LM vs Tiny LLama vs Mini CPM vs Qwen 1.5 | War of SLMs",
        author: "Queryloop",
        date: "02/17/2024",
        excerpt: "Benchmarking Emotional intelligence evaluation, Code Generation, Text summarization, and Narrative composition.",
        content: "Small Language Models (SLMs) have been the talk of the town for some time now. Different models are being released almost every day with the aim of achieving results on par with Large Language Models (LLMs). However, in terms of computational and memory cost, SLMs are already ahead..."
        , parts: [
            { "heading": "Introduction", "description": "Small Language Models (SLMs) have been the talk of the town for some time now. Different models are being released almost every day with the aim of achieving results on par with Large Language Models (LLMs). However, in terms of computational and memory cost, SLMs are already ahead. For some time they have been regarded as only smaller versions of LLMs, but now the conditions have changed. SLMs are getting better and better with each passing day and their results are somewhat comparable with the LLMs. Now the question arises: Which SLM is the best? To answer this question, I compared the performance of these small language models (Stable LM, Tiny LLama, MINI CPM, and QWEN 1.5). It was necessary to submit each model to a number of benchmark tests that focused on different NLP tasks. These tasks comprised emotional intelligence evaluation, code generation, text summarization and narrative composition. By looking at the findings of the evaluation, I found that one model consistently outperformed the others across all tasks, while one consistently performed poorly. The remaining two models were comparable to each other and generated similar responses." },
            { "heading": "Advantages of SLMs", "description": "Before jumping into the comparison of these SLMs, it is necessary for us to understand what advantages SLMs have over LLMs. The list is long, but some of the most important aspects include:" },
            { "list": ["Lower computational requirements: SLMs are often less resource intensive and demand less memory and computing power than LLMs. This allows for their utilization on devices with limited resources and also in environments where compute resources would otherwise be limited.", "Faster training times: With fewer parameters to optimize during training, SLMs generally converge faster than LLMs, consequently leading to quicker and better iterations.", "Cost savings: The cost of small models is usually lower than what you would pay for larger ones, for both training and usage. Licensing fees could be reduced, and small models can be deployed and maintained at minimal cost.", "Deployment on Cutting-Edge Devices: Another area where SLMs are more effective than LLMs is on cutting-edge devices, which have resource-limited hardware and need optimized computation to make the interface attractive for users. Examples of such devices include smartphones, wearables, and IoT gadgets."] },
            { "heading": "Testing Conditions", "description": "Prior to conducting the comparative analysis of small language models (SLMs), several preconditions were met to ensure consistency and fairness. Specifically, these conditions were:", "list": ["All models must be instantiated in a conversational format (Chat Models), capable of engaging in dialogue with humans.", "The total parameter count for each SLM must not exceed 2 billion, thereby focusing on truly compact architectures.", "Every model was presented with identical prompts for each task, avoiding any preceding conversation history or context. This approach aimed to minimize bias and ensure that each SLM’s response was solely dependent on the given input."], "space": "Adherence to these conditions would most probably have resulted in unbiased responses from SLMs. Nothing is perfect, hence the word “probably”." },
            { "heading": "Comparison", "description": "Now, we are going to compare these four SLMs:", "list": ["Stable LM-2 1.6 B", "Tiny LlaMA chat 1.1B", "QWEN-1.5 chat 1.8B", "MiniCPM-2B"], "space": "We will compare them across different prompts and rate their responses, along with the reasoning for each rating. The evaluation covers emotional intelligence, code generation, text summarization and narrative composition." },
            { "heading": "Emotional Intelligence Evaluation", "description": "We will be using 3 prompts for emotional intelligence evaluation. These prompts are:" },
            { "heading": "Prompt 1:", "description": "Examine the emotion and sentiment expressed in the following movie review excerpt: “The acting was superb, but the plot was predictable and lackluster.” Determine if the overall impression conveyed by the statement leans more towards being positive, negative, or neutral." },
            { "heading": "Prompt 2:", "description": "Describe two scenarios where understanding customer emotions could significantly contribute to improving business outcomes. Suggest a potential solution involving emotion detection technology for each situation." },
            { "heading": "Prompt 3:", "description": "Based on the weather conditions described below, predict the likely mood of the speaker: “A heavy blanket of clouds smothered the sky, casting an eerie gray pallor over the once vibrant cityscape. Raindrops pattered against windows with rhythmic monotony, creating a somber symphony that echoed the residents’ melancholic spirits.”" },
            { "space": "A few of the screenshots are attached below:", images: [slm1, slm2], "list": ["Stable LM-2 1.6 B: Across all three prompts, the responses generated by Stable LM were given a rating of 9/10. The main reason being that it remained consistent in its response, dissected the prompt appropriately and had depth in its answer", "Tiny LlaMA chat 1.1B: The responses generated by this model were ranked 8/10. It provided accurate answers but they were over-simplified and depth was missing which is considered important in terms of emotional intelligence", "QWEN-1.5 chat 1.8B: Its responses have been given the same rating as that of Stable LM-2 i.e. 9/10. It provided very descriptive and precise answers and maintained a balanced perspective", "MiniCPM-2B: For the first prompt, the model couldn’t perform well (Rating 7/10) but for the remaining two prompts, the results were on par and were given a rating of 9/10. The reason for low rating of first prompt was vague arguments and the model was not confident in its response."] },
            { "heading": "Narrative Composition/Story Writing", "description": "We have done this evaluation on a single prompt and have ranked the responses based on the storyline and details incorporated by each response.", "space": "Some of the screenshots of responses are attached below.", images: [slm3, slm4] },
            { "heading": "Prompt:", "description": "In a sleepy town where nothing ever happens, ordinary citizens start developing extraordinary powers overnight — an elderly woman gains telekinesis, a schoolboy acquires super strength, and a timid girl suddenly becomes invisible. As everyone grapples with their newfound abilities, tensions rise, fueling fear and prejudice among neighbors. Write a poignant story exploring themes of acceptance, change, and community in this magical setting.", "list": ["Stable LM-2 1.6 B: Rating 9/10. Consistent pacing, good balance of emotion and action, and solid exploration of the themes.", "Tiny LlaMA chat 1.1B: Rating 8/10. Heartwarming portrayal of acceptance, change, and community. Somewhat predictable yet still engaging, with room for improvement in terms of descriptiveness and complexity of subplots.", "QWEN-1.5 chat 1.8B: Rating 6/10. There were disparities in tone. There was no linkage between emotional growth and better community issues.", "MiniCPM-2B: Rating 8/10. It is the conflict resolution, character development, and good theme integration which make the work engaging. The use of subtleness and complexity could help to create the suspense prior to the revelations of the super powers."] },
            { "heading": "Code Generation", "description": "For code generation, we have evaluated the models on 2 prompts", },
            { "heading": "Prompt 1:", "description": "Develop a lightweight microservice written in Go or Rust that resizes incoming JPG images to specified dimensions using OpenCV or any alternative computer vision library. Optimize the solution for minimal latency and memory footprint." },
            { "heading": "Prompt 2: ", "description": "Given a database schema consisting of two tables: “Orders” (OrderID int PRIMARY KEY, CustomerName varchar(50)) and “OrderDetails” (DetailID int PRIMARY KEY, OrderID int, ProductName varchar(50), Quantity int, UnitPrice decimal(18,2)), write an SQL query to retrieve the total revenue for each customer who has placed orders. Format the output as follows: CustomerName, TotalRevenue, where TotalRevenue represents the sum of all products’ prices multiplied by quantities ordered by that customer. Display customers with zero sales too. Sort the final result set alphabetically by customer name.", "space": "Some of the screenshots of the responses are attached below" },
            { "images": [slm5, slm6], "list": ["Stable LM-2 1.6 B: Rating 9/10. Stable LM mostly generated the right code but on some instances left the space empty for the user to fill where the main logic should be written.", "Tiny LlaMA chat 1.1B: Rating 6.5/10. It could not perform well on both the coding tasks specifically in SQL query.", "QWEN-1.5 chat 1.8B: Rating 7/10. This model generated the worst response for SQL query. But performed relatively well for the Go microservice.", "MiniCPM-2B: Rating 8.5/10. Performed well on both prompts. Generated a slightly better response for Go Microservice prompt."] },
            { "heading": "Text Summarization", "description": "For this task, I have picked up a random article from the web having approximately 4500 tokens. The article is about Ethical Assessment of Implantable Brain Chips", "space": "Some screenshots are attached below.", "images": [slm7, slm8], "list": ["Stable LM-2 1.6 B: Rating 7/10. It touches on almost all the important points of the original text, but misses some nuances about the potential societal implications of the technology.", "Tiny LlaMA chat 1.1B: Rating 8/10. It covers all the relevant topics and adds valuable context to some of the issues raised in the original text.", "QWEN-1.5 chat 1.8B: Rating 0/10. No text generated due to fixed context length (2048) of the model.", "MiniCPM-2B: Rating 9/10. The response generated by this model is the strongest, as it comprehensively addresses the topic and offers insightful commentary on the ethical and societal implications of implantable brain chips."] },
            { "heading": "Conclusion", "description": "After the comparative evaluation and performance assessment of Stable LM-2, Tiny LLama, MINI CPM and QWEN 1.5, it was found that Stable LM-2 outperformed the others. Its results in emotional intelligence, coding exercises, text summarization, and story writing demonstrated its competence.", "space": "At the other end of the spectrum, Tiny Llama trailed behind its competitors, being beaten on almost every task. Despite flashes of brilliance, it was unable to keep up with the rest of the contenders, earning its place as the least efficient model." },
            { "space": "As for MINI CPM and QWEN 1.5, the study indicated that their performances were quite the same through most of the tests. Although neither was able to surpass Stable LM-2, both of them exhibited flair in some areas and could therefore be utilized alongside each other according to the use case requirements or availability of resources." },
            { "tags": ["AI", "Machine Learning", "Deep Learning", "Language Model", "LLM", "Queryloop"] }
        ]
    },
    {
        id: "queryloop0007",
        title: "The Future of Database Queries: Evaluating Text-to-SQL and Text-to-NoSQL with AI",
        author: "Queryloop",
        date: "04/19/2024",
        excerpt: "Text-to-SQL and Text-to-NoSQL with AI",
        content: "In the wake of ChatGPT and other large language models (LLMs) gaining prominence, the fascination with Retrieval Augmented Generation (RAG) — essentially conversing directly with your data — has skyrocketed.",
        parts: [
            { "heading": "Introduction", "description": "In the wake of ChatGPT and other large language models (LLMs) gaining prominence, the fascination with Retrieval Augmented Generation (RAG) — essentially conversing directly with your data — has skyrocketed. While the concept of querying databases using natural language is captivating, the practical implementation of such RAG applications poses significant challenges.", "space": "This article delves into the exciting realm of using LLMs to transform natural language into SQL and NoSQL queries. Imagine the ease of fetching and filtering data just by typing out your thoughts. However, as straightforward as this may sound, the process is fraught with complexities. LLMs, for all their intelligence, are still prone to errors and can sometimes produce inaccurate or fabricated information." },
            { "space": "Despite these challenges, the focus of our discussion will not be on the pitfalls alone but rather on comparing two powerful methods of data interaction: Text-to-SQL versus Text-to-NoSQL. Which approach is more effective? Which offers more accurate results or lower latency? And, when dealing with simpler data formats like CSV or Excel files, which method should you prefer?" },
            { "space": "Join me as we explore these questions, offering insights and perhaps even answers on how best to leverage these groundbreaking technologies in real-world applications." },
            { "heading": "Text-to-SQL", "description": "The Text-to-SQL process involves providing an LLM with the schema of a database table, sometimes accompanied by an example row, to contextualize the data structure. This setup allows the model to understand what kind of information is stored and how it is organized.", "space": "When a user poses a query in natural language, the LLM utilizes this contextual information to generate a corresponding SQL query. This query isn’t just a direct transformation of text; it’s an intelligent creation that considers the user’s intent and the database’s architecture." },
            { "space": "Here’s how it typically works in a practical setting:", "list": ["1. Input Preparation: The user’s natural language query is received.", "2. Query Generation: The LLM processes this input along with the provided schema and potentially an example row, to generate a SQL query.", "3. Execution: The SQL query is executed within a database environment, such as SQLite in Python.", "4. Output: The result of the query is returned to the user, completing the cycle from natural language to database response."] },
            {
                "space": "Below is a Python function that illustrates this process in action:", "code": `def read_sql_query(user_input: str, table_schema: str,database: str):
            """Generates an SQL query based on user input and retrieves data from an SQLite database.
            
            :param user_input: User's question or input that guides the query generation.
            :type user_input: str
            :param table_schema: A string representation of the table schema.
            :type table_schema: str
            :param database: The path to the SQLite database file.
            :type database: str
            :return: A list of dictionaries containing data retrieved from the SQLite database, or an error message.
            :rtype: list[dict] or str
            """
            attempts = 0
            while attempts < 3:
                        try:
        
                                user_message=user_input
        
                                system_temp = f"""
                                You are a SQL expert. Create a syntactically correct SQL query for the user question:
        
                                This is the table schema : "{table_schema}"
                                Return the SQL query in JSON.
                                output should be like  "SQL_query": "json_object"
                                Unless the user specifies in the question a specific number of examples to obtain, limit your query to at most 10 results using the LIMIT stage in SQL query
                                Always use the field names as they appear in the table. Be careful not to query for fields that do not exist.
                                ###{user_message}###
                                """
        
                        
                                
                                client = openai.OpenAI(
                                                api_key=os.environ.get("OPENAI_API_KEY"),
                                        )
                                response = client.chat.completions.create(
                                        model="gpt-3.5-turbo-1106",
                                        temperature=0,
                                        response_format={"type": "json_object"},
                                        messages=[
                                        {"role": "system", "content": system_temp}
                                                ]
                                        )
        
                                output_str = response.choices[0].message.content
                                        
                                        # Parse the output string as JSON; the query is read from its 'SQL_query' key below
                                output_json = json.loads(output_str)
                                        
                                print(output_json)
                                        # Leftover MongoDB connection parameters (not used by the SQLite connection below)
                                host = "localhost"
                                port = 27017
                                database_name = database
        
                                        # Connect to the SQLite database
                                connection = sqlite3.connect(database)
                                cursor = connection.cursor()
        
                                # Execute the generated SQL query
                                cursor.execute(output_json['SQL_query'])
        
                                # Fetch all rows from the query result
                                rows = cursor.fetchall()
        
                                # Convert rows to a list of dictionaries using column names as keys
                                columns = [description[0] for description in cursor.description]
                                data = [dict(zip(columns, row)) for row in rows]
        
                                # Close the database connection
                                connection.close()
        
                                return data
                        
                        except sqlite3.OperationalError as e:
                                print("Operational error, retrying...")
                                attempts += 1
                        except Exception as e:
                        # Handle other exceptions here if necessary
                                connection.close()
                                return str(e)
                        finally:
                                if connection:
                                        connection.close()
                        
            return "Functional error after multiple retries."`},
            { "space": "(Note: While there are several methods to extract database schemas, we’ll skip that part for brevity in this discussion.)" },
            { "space": "Now, let’s demonstrate the practicality of Text-to-SQL using a common dataset found in many examples, Movies.csv. To test the functionality, we'll pose a straightforward query to our system." },
            { "space": "Here’s the function call that tests the system:", "code": `read_sql_query(user_input="What is the top rated movie?", table_schema=schema_dict, database='database.db')` },
            { "heading": "SQL Query Generated by LLM:", "code": `{'SQL_query': 'SELECT * FROM Movies ORDER BY "Movie Rating" DESC LIMIT 1'}` },
            {
                "heading": "Output:", "code": `[{
                'Unnamed: 0': 0,
                'Movie Name': 'The Shawshank Redemption',
                'Year of Release': '(1994)',
                'Watch Time': '142 min',
                'Movie Rating': 9.3,
                'Meatscore of movie': '81        ',
                'Votes': '34,709',
                'Gross': '$28.34M',
                'Description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'
              }]`},
            { "space": "As illustrated, the query successfully identifies “The Shawshank Redemption” as the top-rated movie according to the dataset." },
            { "heading": "Text-to-NoSQL", "description": "Moving from SQL to NoSQL databases, the approach remains fundamentally similar, yet it adapts to the distinct structure and capabilities of NoSQL systems. In the case of MongoDB, one of the most powerful features is its aggregation pipeline, which allows for complex data processing and transformation directly within the database.", "space": "Similar to the Text-to-SQL process, we provide the LLM with an understanding of the database’s structure. However, instead of generating SQL queries, the LLM now crafts MongoDB aggregation pipelines based on the user’s natural language queries. These pipelines are sequences of data processing stages that transform, filter, and aggregate data efficiently." },
            {
                "heading": "The following is the function for Text-to-NoSQL:", "code": `def read_mongodb_query(user_input: str, table_schema: str, schema_description: str, database: str, collectionName: str):
            """Generates a MongoDB raw aggregation pipeline based on user input and retrieves data from a MongoDB collection.
            
            :param user_input: User's question or input that guides the query generation.
            :type user_input: str
            :param table_schema: A string representation of the table schema.
            :type table_schema: str
            :param schema_description: A string that describes the schema of the data.
            :type schema_description: str
            :param database: The name of the MongoDB database to query.
            :type database: str
            :param collectionName: The name of the MongoDB collection to query.
            :type collectionName: str
            :return: A list of dictionaries containing data retrieved from the MongoDB collection, or an error message.
            :rtype: list[dict] or str
            """
            max_retries=3
            retries = 0
            user_message=user_input
        
            system_temp = f"""
            You are a MongoDb expert. Create a syntactically correct raw aggregation query for the user question:
        
            This is the table schema : "{table_schema}"
            The following is an the example of one mongodb document: {schema_description}
            Return the raw aggregration pipeline in JSON.
            output should be like  "pipeline": "json_object"
            Unless the user specifies in the question a specific number of examples to obtain, limit your query to at most 10 results using the $limit stage in an aggregation pipeline.
            Always use the field names as they appear in the collection. Be careful not to query for fields that do not exist.
            When dealing with dates, and if the question involves "today", use MongoDB's $currentDate, $dateFromString, or similar operators to work with the current date.
            DO NOT USE $TRIM etc to convert the price in raw aggregation, data is already in double.
            DO NOT to use $lookup
            ###{user_message}###
            """
            while retries < max_retries:
                try:
               
                    
                    client = OpenAI(
                        api_key=os.environ.get("OPENAI_API_KEY"),
                    )
                    response = client.chat.completions.create(
                        model="gpt-3.5-turbo-1106",
                        response_format={"type": "json_object"},
                        messages=[
                            {"role": "system", "content": system_temp}
                        ]
                    )
        
                    output_str = response.choices[0].message.content
                    
                    # Attempt to parse the output string as JSON and check for the 'pipeline' key
                    output_json = json.loads(output_str)
                    if 'pipeline' not in output_json:
                        raise ValueError("Invalid format: 'pipeline' key not found")
        
                    # MongoDB connection parameters
                    host = "localhost"
                    port = 27017
                    database_name = database
        
                    # Connect to MongoDB
                    mongo_client = pymongo.MongoClient(host, port)
                    db = mongo_client[database_name]
                    collection = db[collectionName]
        
                    # Perform the query and retrieve data
                    result = collection.aggregate(output_json['pipeline'])
                    print(f"(TEST QUERY:{output_json['pipeline']})")
                    # Convert the result to a list of dictionaries
                    data = [item for item in result]
        
                    # Close the MongoDB connection
                    mongo_client.close()
        
                    return data
                except (json.JSONDecodeError, ValueError) as e:
                    print(f"Error encountered: {e}. Retrying...")
                    retries += 1
                    if retries == max_retries:
                        # Close the MongoDB connection if open and retries are exhausted
                        mongo_client.close()
                        return "Error: Could not figure the query out after maximum retries."`},
            { "space": "Here we’ll be using property_rentals.csv, another common csv dataset available.", "code": `read_mongodb_query(user_input="what is the average of the prices", table_schema=schema_dict, schema_description=example_schema, database=database, collectionName=property_collection)` },
            { "space": "Aggregation pipeline generated by LLM:", "code": `{'$group': {'_id': None, 'average_price': {'$avg': {'$toDouble': '$price'}}}}, {'$limit': 10}` },
            { "space": "Output:", "code": `{'_id': None, 'average_price': 227.61161116111612}` },
            { "space": "This example illustrates how the LLM formulates a MongoDB aggregation pipeline." },
            { "heading": "Comparing Text-to-SQL and Text-to-NoSQL: Performance, Accuracy, and Practicality", "description": "Having explored both Text-to-SQL and Text-to-NoSQL techniques, the question arises: which approach is better? The answer varies depending on several factors, including performance, accuracy, and the specific use case." },
            { "heading": "Latency and Performance", "description": "In terms of response time, Text-to-SQL generally demonstrates faster query generation compared to Text-to-NoSQL using MongoDB’s aggregation pipelines. This difference primarily stems from the inherent complexity of NoSQL operations. Even basic queries in MongoDB can require multiple stages in an aggregation pipeline, making them inherently more complex and slower to generate than equivalent SQL queries." },
            { "heading": "Accuracy and Reliability", "description": "Accuracy is another critical factor where Text-to-SQL tends to outshine Text-to-NoSQL. SQL queries are often more straightforward and less prone to errors, particularly in scenarios involving complex calculations or multiple filtering steps. LLMs handling NoSQL queries can sometimes “hallucinate” or generate inaccurate data manipulations, especially under complex querying conditions. This makes Text-to-SQL more reliable for precise data retrieval and operations that involve complex statistical calculations." },
            { "heading": "Practical Considerations", "description": "It’s essential to consider the database schema complexity. For SQL databases with numerous interconnected tables, fully injecting a complex schema into an LLM can be impractical due to constraints like token limits in the model. This can lead to failures or incomplete schema understanding, which impacts the quality of the generated queries.", "space": "On the other hand, when using formats like CSV for testing both SQL and NoSQL methods, the simplicity of the data format does not fully exploit the strengths and capabilities of either approach. While simpler, this does not reflect the performance and utility of these methods in handling more complex, real-world database structures." },
            { "heading": "Use Case Specificity", "description": "The choice between Text-to-SQL and Text-to-NoSQL ultimately depends on the specific use case:", "space": "Text-to-SQL is preferable for environments where precise and accurate retrieval of structured data is critical, especially when dealing with statistical data or when the database schema is highly relational. Text-to-NoSQL may be more suitable in environments where the data is more hierarchical or loosely structured, or where the database operations require more flexible and dynamic data manipulation capabilities." },
            { "heading": "Conclusion", "description": "Both Text-to-SQL and Text-to-NoSQL have their respective strengths and weaknesses. The decision on which to use should be guided by the specific requirements of the application, including the need for speed, accuracy, and the complexity of the data and queries involved. As technology and LLM capabilities evolve, these distinctions may blur, but for now, a careful evaluation of each method’s merits and limitations is essential for optimal implementation." },
            { "tags": ["LLM", "Text to SQL", "Text to NoSql", "Large Language Models", "OpenAI", "AI", "Queryloop"] },
        ]
    },
    {
        id: "queryloop0008",
        title: "Explanation of Karpathy's Micrograd",
        author: "Queryloop",
        date: "05/23/2024",
        excerpt: "A walkthrough of the Value class, the chain rule, and backpropagation in Karpathy's Micrograd.",
        content: "Neural Networks: Zero to Hero, a course by Andrej Karpathy, focuses on building neural networks from scratch, starting with the basics of backpropagation and advancing to modern deep neural networks like GPT. The course emphasizes language models as an ideal entry point into deep learning, with transferable knowledge applicable to other areas like computer vision. Prerequisites include solid programming skills (Python) and introductory-level math (e.g., derivatives, Gaussian).",
        parts: [
            { "heading": "Introduction", "description": "Neural Networks: Zero to Hero, a course by Andrej Karpathy, focuses on building neural networks from scratch, starting with the basics of backpropagation and advancing to modern deep neural networks like GPT. The course emphasizes language models as an ideal entry point into deep learning, with transferable knowledge applicable to other areas like computer vision. Prerequisites include solid programming skills (Python) and introductory-level math (e.g., derivatives, Gaussian)." },
            { "space": "The course includes a detailed syllabus:" },
            { "heading": "Intro to Neural Networks and Backpropagation (2h 25m):", "description": "Step-by-step explanation of backpropagation and training neural networks, assuming basic Python knowledge and high school-level calculus." },
            { "heading": "Intro to Language Modeling (1h 57m):", "description": "Implementation of a bigram character-level language model, introducing torch.Tensor and language modeling framework (model training, sampling, loss evaluation)." },
            { "heading": "Building makemore Part 2: MLP (1h 15m):", "description": "Implementation of a multilayer perceptron (MLP) character-level language model, covering basics of machine learning (model training, hyperparameters, evaluation, etc.)." },
            { "heading": "Building makemore Part 3: Activations & Gradients, BatchNorm (1h 55m):", "description": "Examination of MLP internals, training challenges, and introduction of Batch Normalization for easier training of deep neural nets." },
            { "heading": "Building makemore Part 4: Becoming a Backprop Ninja (1h 55m):", "description": "Manual backpropagation through a 2-layer MLP, building strong intuitive understanding of gradient flow and neural net optimization." },
            { "heading": "Building makemore Part 5: Building a WaveNet (56m): ", "description": "Transformation of a 2-layer MLP into a deeper convolutional neural network architecture similar to WaveNet (2016), exploring torch.nn and typical deep learning development processes." },
            { "heading": "Let's build GPT: from scratch, in code (1h 56m):", "description": "Construction of a Generatively Pretrained Transformer (GPT), following the “Attention is All You Need” paper and OpenAI’s GPT-2/GPT-3, with connections to ChatGPT and GitHub Copilot." },
            { "heading": "Let's build the GPT Tokenizer (2h 13m): ", "description": "Building the Tokenizer used in the GPT series, discussing its role in LLMs, training algorithms, and issues related to tokenization." },
            { "space": "" },
            { "description": "The course provides a comprehensive and hands-on approach to understanding and building neural networks, making it accessible for those with the necessary prerequisites. For collaborative learning, participants are encouraged to join the Discord channel." },
            { "space": "" },
            { "description": "In this article, I will discuss Micrograd, developed by Andrej Karpathy. The link to the GitHub repository is given below." },
            { "link": "https://github.com/karpathy/micrograd/tree/master/micrograd" },
            { "space": "" },
            { "description": "Karpathy has explained it in a great way in the first lesson of the course Neural Networks: Zero to Hero. Link to the video lecture is given below." },
            { "link": "https://www.youtube.com/watch?v=VMj-3S1tku0" },
            { "space": "" },
            { "description": "Karpathy covers all of these concepts very well, but I will focus on the mathematical and object-oriented programming concepts that I think should be explained more explicitly for a novice." },
            { "space": "We can describe the Micrograd engine using these key points." },
            { "list": ["Backpropagation: The engine implements backpropagation.", "Dynamically Built DAG: The computations are represented as a Directed Acyclic Graph (DAG) that is built dynamically.", "Small Neural Networks Library: On top of the autograd engine, there is a small neural network library that mimics the API of PyTorch.", "Scalar Operations: The DAG operates on scalar values, meaning each computation is broken down into the smallest possible operations (additions and multiplications). This granularity helps in understanding how neural networks operate at a fundamental level.", "Educational Purpose: The simplicity and small size of the code make it ideal for learning and teaching purposes. Despite its simplicity, it is powerful enough to construct and train deep neural networks for tasks like binary classification."] },
            { "space": "The Micrograd repository consists of two .py files: engine.py and nn.py. I will start with engine.py because it implements backpropagation. First, I will briefly discuss the code and then explain the chain rule and how it is implemented in the code. The code for the engine.py file is provided below." },
            {
                "code": `class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self):
        return self * -1

    def __radd__(self, other):
        return self + other

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return other + (-self)

    def __rmul__(self, other):
        return self * other

    def __truediv__(self, other):
        return self * other**-1

    def __rtruediv__(self, other):
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"`},
            { "space": "" },
            { "description": "The above code defines the Value class, which is the core of the automatic differentiation engine. It allows for the creation of a computational graph and supports backpropagation to compute gradients." },
            { "heading": "Chain Rule in the Context of Forward and Backward Passes" },
            { "description": "The chain rule is a fundamental concept in calculus that is essential for understanding and implementing backpropagation in neural networks. It allows us to compute the derivative of composite functions, enabling the efficient calculation of gradients for optimization. In this explanation, we will explore the chain rule using both the forward and backward passes, and apply it to a custom Value class designed for automatic differentiation." },
            { "heading": "Understanding the Chain Rule in Multivariable Calculus", "description": "The chain rule is a fundamental concept in calculus used to find the derivative of composite functions. When dealing with functions of multiple variables, the chain rule helps us understand how a change in one variable affects another through a sequence of dependencies." },
            { "heading": "The General Chain Rule", "images": [andr1] },
            { "space": "Since functions in neural networks are scalar-valued, we need to discuss the chain rule specifically for scalar-valued functions." },
            { "heading": "Scalar-Valued Functions" },
            { "images": [andr2] },
            { "space": "If you look at our __mul__ and __add__ functions, when we add or multiply two Value objects, we are creating a new function that is a composition of the two functions involved in the addition or multiplication. Here is an example of a chain rule involving a function that is a composite of two functions that are adding and multiplying with each other." },
            { "heading": "Addition" },
            { "images": [andr3] },
            {
                "code": `def __add__(self, other):
  other = other if isinstance(other, Value) else Value(other)
  out = Value(self.data + other.data, (self, other), '+')

  def _backward():
      self.grad += out.grad
      other.grad += out.grad
  out._backward = _backward
  return out`},
            { "space": "In the above examples of the chain rule, u and v are the parent functions (parent nodes) and f is the child function (child node)." },
            { "description": "In the above code, the _backward() function calculates the derivative with respect to self and other. out is a composite of self and other, or out is a child of self and other. Since self and other are being added, the arithmetic operation is addition. According to the chain rule, the partial derivative will be 1 for all parent functions or variables when the arithmetic operation is addition. Therefore, the _backward() function is:" },
            {
                "code": `def _backward():
      self.grad += 1 * out.grad
      other.grad += 1 * out.grad`},
            { "space": "Since multiplying by 1 changes nothing, it is not written explicitly in the original code snippet. The question then arises: why do we multiply the partial derivatives with respect to the parents (self and other) by the gradient of the child node (out.grad)? The answer lies in the chain rule. According to the chain rule, if a child node depends on its parent nodes, any change in a parent propagates to the final output through the child. We calculate this effect by multiplying the gradient of the child (out.grad) by the local partial derivative of the child with respect to each parent." },
            { "space": "Now, why do we increment the gradients of the parents (self and other)? This is because if a parent node has more than one child, the effect of changes related to each child should be reflected in the gradient of the parent. According to the chain rule, this is achieved by adding each gradient. Thus, the gradients of self and other are incremented to accumulate the effects of all their children." },
            { "heading": "Multiplication" },
            { "images": [andr4] },
            {
                "code": `def __mul__(self, other):
  other = other if isinstance(other, Value) else Value(other)
  out = Value(self.data * other.data, (self, other), '*')

  def _backward():
      self.grad += other.data * out.grad
      other.grad += self.data * out.grad
  out._backward = _backward

  return out`},
            { "space": "In the above code, the _backward() function calculates the derivative with respect to self and other. out is a composite of self and other, or out is a child of self and other. Since self and other are being multiplied, the arithmetic operation is multiplication. According to the chain rule, the partial derivative with respect to self will be other, and the partial derivative with respect to other will be self." },
            { "space": "The reason why we multiply the partial derivatives of self and other by out.grad and why we increment self.grad and other.grad is explained above." },
            { "heading": "Topological Sort:" },
            { "description": "Topological sort is a linear ordering of vertices in a Directed Acyclic Graph (DAG) such that for every directed edge u→v, vertex u comes before vertex v. This sorting is crucial in scenarios where certain tasks must be performed before others, such as task scheduling, course prerequisite planning, and, in our context, the sequence of computations in neural networks for backpropagation." },
            { "heading": "Role of Topological Sort in Backpropagation", "description": "Backpropagation is the process of computing the gradient of the loss function with respect to each parameter in the neural network. It involves moving backward through the computational graph of the network, starting from the output and working toward the input. Topological sort ensures that each node (representing a computation) is processed only after all its dependencies have been handled. This is essential because the gradient of each node depends on the gradients of its children." },
            { "heading": "Example Workflow" },
            {
                "code": `a = 2
b = 3
c = a * b  # c = 6
d = c + b  # d = 9
e = d + a  # e = 11`},
            { "description": "The topological graph of the above equations is given below." },
            { "images": [andr5] },
            { "description": "The topological sort of the above graph would be: [a, b, c, d, e]. Because backpropagation works in the opposite direction, we need the topological sort in reverse order. The code below implements topological sort using the DFS method." },
            {
                "code": `def backward(self):
    topo = []
    visited = set()

    def build_topo(v):
        if v not in visited:
            visited.add(v)
            for child in v._prev:
                build_topo(child)
            topo.append(v)
    build_topo(self)

    self.grad = 1
    for v in reversed(topo):
        v._backward()`},
            { "heading": "Explanation", "description": "The backward function uses Depth-First Search (DFS) to perform the topological sort that is necessary for backpropagation." },
            { "heading": "Initialization:", "space": "visited is a set used to keep track of visited nodes to avoid processing the same node multiple times.", "description": "topo is an empty list that will store the nodes in topologically sorted order." },
            { "heading": "build_topo Function", "list": ["This is a recursive function that performs a depth-first traversal of the graph.", "If a node v has not been visited, it is marked as visited.", "The function recursively visits all the nodes in v._prev (i.e., the parent nodes of v), exploring as far down each branch as possible before backtracking.", "After visiting all its parent nodes, v is appended to the topo list."] },
            { "heading": "Topological Sort", description: "The function build_topo(self) initiates the depth-first search from the current node (self), ensuring that all nodes influencing self are visited and ordered correctly in topo." },
            { "heading": "Backward Pass", "list": ["The gradient of the final output node (self) is initialized to 1.", "The nodes are processed in reverse topological order, starting from the output node and moving backward through the graph.", "The _backward method for each child node is called to propagate its gradient to its parents according to the chain rule."] },
            { "description": "I have explained the important functions related to backpropagation. Now, I will briefly explain the other functions of the Value class. But first I will explain double underscore (dunder) functions in python." },
            { "heading": "Double Underscore Functions in Python" },
            { "description": "Double underscore functions in Python, also known as “dunder” methods (short for “double underscore”), are special methods that have double underscores before and after their names (e.g., __init__, __add__). These methods are part of Python's data model and are used to define the behavior of objects for built-in operations. The __add__ and __mul__ functions explained above are examples of “dunder” functions; in them we define the behavior of addition and multiplication for the Value class. Let's move on to the other functions of the Value class." },
            {
                "heading": "Power Function (__pow__)", "code": `def __pow__(self, other):
    assert isinstance(other, (int, float)), "only supporting int/float powers for now"
    out = Value(self.data**other, (self,), f'**{other}')

    def _backward():
        self.grad += (other * self.data**(other-1)) * out.grad
    out._backward = _backward

    return out`},
            { "heading": "Explanation:", "description": "" },
            { "heading": "Purpose", "description": "This function allows a Value object to be raised to a power (exponentiation)." },
            { "heading": "Parameters", "list": ["self: The current Value object.", "other: The exponent, which must be an integer or float."] },
            { "heading": "Forward Pass", "list": ["The data attribute of self is raised to the power of other.", "A new Value object, out, is created with the result of the exponentiation. This new object keeps track of self as its parent, and the operation performed (exponentiation) is recorded."] },
            { "heading": "Backward Pass", "list": ["The _backward function calculates the gradient of the exponentiation operation using the chain rule.", "The partial derivative is calculated according to the power rule of the derivative which is given below.", "This derivative is multiplied with out.grad to propagate the gradient backward."] },
            {
                "code": `y = x^n
dy/dx = n*x**(n-1)`},
            { "heading": "Return", "description": "The function returns the new Value object out that represents the result of the exponentiation." },
            {
                "heading": "ReLU Function (relu)", "code": `def relu(self):
    out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

    def _backward():
        self.grad += (out.data > 0) * out.grad
    out._backward = _backward

    return out` },
            { "heading": "Explanation:" },
            { "heading": "Purpose", description: "This function applies the ReLU (Rectified Linear Unit) activation function, which is commonly used in neural networks." },
            { "heading": "Mathematical Definition", description: "The ReLU function is defined as: ReLU(x)=max(0,x)", space: "This means:", list: ["If the input x is greater than 0, the output is x.", "If the input x is less than or equal to 0, the output is 0."] },
            { "heading": "Forward Pass", "list": ["The data attribute of self is compared to 0.", "If self.data is less than 0, out.data is set to 0.", "Otherwise, out.data is set to self.data.", "A new Value object, out, is created to store the result of the ReLU operation, keeping track of self as its parent and the operation performed (ReLU)."] },
            { "heading": "Backward Pass", list: ["The _backward function calculates the gradient of the ReLU operation using the chain rule.", "The derivative of ReLU is 1 if self.data is greater than 0, and 0 otherwise.", "This derivative is multiplied by out.grad to propagate the gradient backward."] },
            {
                heading: "Return", list: ["The function returns the new Value object out that represents the result of the ReLU activation."], code: `def __neg__(self):
    return self * -1`},
            {
                list: ["Purpose: This method allows the unary negation operator (-) to be used with Value objects. This function will help us to tackle subtraction as a special case of addition. It is used in __sub__ method of value class.", "Explanation: When -self is called, it returns the result of multiplying the Value object by -1. This effectively negates the value."], code: `def __radd__(self, other):
    return self + other`},
            {
                list: ["Purpose: This method allows the addition operator (+) to be used with Value objects on the right-hand side.", "Explanation: The __add__ method defined above handles self + other. When other + self is called and other does not have its own __add__ method that can handle the addition, Python falls back to __radd__. This method simply calls self + other, leveraging the existing __add__ method."], code: `def __sub__(self, other):
    return self + (-other)`},
            {
                list: ["Purpose: This method allows the subtraction operator (-) to be used with Value objects.", "Explanation: When self - other is called, it returns the result of adding self to the negation of other. This is done using the previously defined __neg__ method."], code: `def __rmul__(self, other):
    return self * other`},
            {
                list: ["Purpose: This method allows the multiplication operator (*) to be used with Value objects on the right-hand side.", "Explanation: When other * self is called and other does not have its own __mul__ method that can handle the multiplication, Python falls back to __rmul__. This method simply calls self * other, leveraging the existing __mul__ method."], code: `def __truediv__(self, other):
    return self * other**-1`},
            {
                list: ["Purpose: This method allows the true division operator (/) to be used with Value objects.", "Explanation: When self / other is called, it returns the result of multiplying self by the reciprocal of other. It handles division as a special case of multiplication. This uses the exponentiation operator (**) to raise other to the power of -1."], code: `def __rtruediv__(self, other):
    return other * self**-1`},
            {
                list: ["Purpose: This method allows the true division operator (/) to be used with Value objects on the right-hand side.", "Explanation: When other / self is called and other does not have its own __truediv__ method that can handle the division, Python falls back to __rtruediv__. This method returns the result of multiplying other by the reciprocal of self."], code: `def __repr__(self):
    return f"Value(data={self.data}, grad={self.grad})"`},
            { list: ["Purpose: This method provides a string representation of the Value object.", "Explanation: When repr(self) or print(self) is called, it returns a string that includes the data and grad attributes of the Value object. This is useful for debugging and logging purposes."] },
            { heading: "Why __radd__ and __rmul__ Are Needed", description: "The __radd__ and __rmul__ methods (and other right-hand side methods like __rsub__ and __rtruediv__) are required to handle cases where the left operand does not support the operation with the Value object as the right operand. These methods ensure that the Value object can interact correctly with non-Value objects in binary operations." },
            { heading: "Example Scenario", description: "Consider the addition operation 3 + value_object, where value_object is an instance of the Value class." },
            { heading: "Without __radd__", list: ["Python first tries to call the __add__ method on the integer 3, but the integer class does not know how to handle a Value object.", "This results in a TypeError."] },
            { heading: "With __radd__", list: ["When Python cannot find a suitable __add__ method on the left operand (the integer 3), it looks for an __radd__ method on the right operand (the Value object).", "The __radd__ method on the Value object is called, successfully handling the operation and returning the correct result."] },
            { description: "We have gone through the code of Value class. Now I will explain the whole backpropagation process with an example." },
            { heading: "Example", description: "Let’s consider the equations provided:", list: ["c = a+b", "d = a×b", "e = c+d"] },
            {
                heading: "Forward Pass", code: `a = Value(2.0)
b = Value(3.0)`},
            {
                space: "Perform addition and multiplication:", code: `c = a + b  # c = Value(5.0)
d = a * b  # d = Value(6.0)`},
            { space: "Perform addition to get e:", code: `e = c + d  # e = Value(11.0)` },
            { space: "The computational graph is given below.", images: [andr6] },
            { heading: "Backward Pass", description: "Initialize Gradient for e:", space: "The gradient for e with respect to e will be 1.", code: `e.grad = 1` },
            { space: "Process Each Node in Reverse Topological Order", code: `e.backward()` },
            { description: "The backward method computes the gradients in the correct order:" },
            { heading: "For e:", description: "e is the composite function of c and d. The arithmetic operation between parent nodes is addition. So the partial derivative with respect to parent nodes will be 1." },
            { list: ["e._backward() propagates e.grad (which is 1) to c and d.", "c.grad += e.grad → c.grad = 1", "d.grad += e.grad → d.grad = 1"] },
            { heading: "For d:", description: "d is a composite function of a and b. The arithmetic operation between parent nodes is multiplication. So partial derivative with respect to a will be b and partial derivative with respect to b is a." },
            { list: ["d._backward() propagates d.grad to a and b.", "a.grad += b.data * d.grad → a.grad += 3.0 * 1 → a.grad = 3", "b.grad += a.data * d.grad → b.grad += 2.0 * 1 → b.grad = 2"] },
            { heading: "For c:", description: "c is a composite function of a and b. The arithmetic operation between parent nodes is addition. So the partial derivative with respect to parent nodes will be 1.", space: "Due to the effect of d, a.grad = 3" },
            { space: "b.grad = 2", list: ["c._backward() propagates c.grad to a and b", "a.grad += c.grad → a.grad += 1 → a.grad = 4", "b.grad += c.grad → b.grad += 1 → b.grad = 3"], description: "The computational graph after backpropagation is given below." },
            { images: [andr7] },
            { heading: "nn.py", description: "To understand this file basic knowledge of neural network architecture is necessary. First I will explain neural network architecture." },
            {},
            {},
            {},
            {},
            {},
            {},
            {},
        ]
    },
]

const Blog = () => {
    const [posts, setPosts] = React.useState(posts_);


    return (
        <div className="blog-container">
            <header className="blog-header">
                <h1>Queryloop Blogs</h1>
            </header>
            <main className="blog-main">
                <BlogList posts={posts} setPosts={setPosts} />
            </main>
        </div>
    )
}

export default Blog;
export { posts_ };